[llvm] [LV][VPlan] Add initial support for CSA vectorization (PR #106560)

Michael Maitland via llvm-commits llvm-commits at lists.llvm.org
Mon Oct 14 06:14:13 PDT 2024


https://github.com/michaelmaitland updated https://github.com/llvm/llvm-project/pull/106560

>From b69ad63d588f608fa3ad0530931b041c8956ba35 Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Mon, 19 Aug 2024 12:34:43 -0700
Subject: [PATCH 01/23] [LV] Precommit csa vectorization test cases

---
 .../Transforms/LoopVectorize/RISCV/csa.ll     | 3520 +++++++++++++++++
 1 file changed, 3520 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/csa.ll

diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/csa.ll b/llvm/test/Transforms/LoopVectorize/RISCV/csa.ll
new file mode 100644
index 00000000000000..71a5519522a275
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/csa.ll
@@ -0,0 +1,3520 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -mtriple riscv64 -mattr="+v" -riscv-v-vector-bits-min=256 \
+; RUN:   -passes=loop-vectorize -force-tail-folding-style=data-with-evl \
+; RUN:   | FileCheck %s -check-prefix=EVL
+; RUN: opt < %s -S -mtriple riscv64 -mattr="+v" -riscv-v-vector-bits-min=256 \
+; RUN:   -passes=loop-vectorize -force-tail-folding-style=none \
+; RUN:   | FileCheck %s -check-prefix=NO-EVL
+; RUN: opt < %s -S -mtriple riscv64 -mattr="+v" -riscv-v-vector-bits-min=256 \
+; RUN:   -passes=loop-vectorize -force-tail-folding-style=data \
+; RUN:   | FileCheck %s -check-prefix=DATA
+
+; This function is generated from the following C/C++ program:
+; int simple_csa_int_select(int N, int *data, int a) {
+;   int t = -1;
+;   for (int i = 0; i < N; i++) {
+;     if (a < data[i])
+;       t = data[i];
+;   }
+;   return t; // use t
+; }
+define i32 @simple_csa_int_select(i32 %N, ptr %data, i64 %a) {
+; EVL-LABEL: @simple_csa_int_select(
+; EVL-NEXT:  entry:
+; EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL:       for.body.preheader:
+; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; EVL:       for.cond.cleanup.loopexit:
+; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; EVL:       for.cond.cleanup:
+; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT:    ret i32 [[T_0_LCSSA]]
+; EVL:       for.body:
+; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT:    [[TMP1:%.*]] = sext i32 [[TMP0]] to i64
+; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A:%.*]], [[TMP1]]
+; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP0]], i32 [[T_010]]
+; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @simple_csa_int_select(
+; NO-EVL-NEXT:  entry:
+; NO-EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL:       for.body.preheader:
+; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-EVL:       for.cond.cleanup.loopexit:
+; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; NO-EVL:       for.cond.cleanup:
+; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT:    ret i32 [[T_0_LCSSA]]
+; NO-EVL:       for.body:
+; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT:    [[TMP1:%.*]] = sext i32 [[TMP0]] to i64
+; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A:%.*]], [[TMP1]]
+; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP0]], i32 [[T_010]]
+; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @simple_csa_int_select(
+; DATA-NEXT:  entry:
+; DATA-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA:       for.body.preheader:
+; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT:    br label [[FOR_BODY:%.*]]
+; DATA:       for.cond.cleanup.loopexit:
+; DATA-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
+; DATA:       for.cond.cleanup:
+; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; DATA-NEXT:    ret i32 [[T_0_LCSSA]]
+; DATA:       for.body:
+; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; DATA-NEXT:    [[TMP1:%.*]] = sext i32 [[TMP0]] to i64
+; DATA-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A:%.*]], [[TMP1]]
+; DATA-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP0]], i32 [[T_010]]
+; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+  %cmp9 = icmp sgt i32 %N, 0
+  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %spec.select.lcssa = phi i32 [ %spec.select, %for.body ]
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %t.0.lcssa = phi i32 [ -1, %entry ], [ %spec.select.lcssa, %for.cond.cleanup.loopexit ]
+  ret i32 %t.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %t.010 = phi i32 [ -1, %for.body.preheader ], [ %spec.select, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %data, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %1 = sext i32 %0 to i64
+  %cmp1 = icmp slt i64 %a, %1
+  %spec.select = select i1 %cmp1, i32 %0, i32 %t.010
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int simple_csa_int_select(int N, int *data) {
+;   int t = -1;
+;   for (int i = 0; i < N; i++) {
+;     if (i < data[i])
+;       t = data[i];
+;   }
+;   return t; // use t
+; }
+define i32 @simple_csa_int_select_induction_cmp(i32 %N, ptr %data) {
+; EVL-LABEL: @simple_csa_int_select_induction_cmp(
+; EVL-NEXT:  entry:
+; EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL:       for.body.preheader:
+; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; EVL:       for.cond.cleanup.loopexit:
+; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; EVL:       for.cond.cleanup:
+; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT:    ret i32 [[T_0_LCSSA]]
+; EVL:       for.body:
+; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT:    [[TMP1:%.*]] = sext i32 [[TMP0]] to i64
+; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP1]]
+; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP0]], i32 [[T_010]]
+; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @simple_csa_int_select_induction_cmp(
+; NO-EVL-NEXT:  entry:
+; NO-EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL:       for.body.preheader:
+; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-EVL:       for.cond.cleanup.loopexit:
+; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; NO-EVL:       for.cond.cleanup:
+; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT:    ret i32 [[T_0_LCSSA]]
+; NO-EVL:       for.body:
+; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT:    [[TMP1:%.*]] = sext i32 [[TMP0]] to i64
+; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP1]]
+; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP0]], i32 [[T_010]]
+; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @simple_csa_int_select_induction_cmp(
+; DATA-NEXT:  entry:
+; DATA-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA:       for.body.preheader:
+; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT:    br label [[FOR_BODY:%.*]]
+; DATA:       for.cond.cleanup.loopexit:
+; DATA-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
+; DATA:       for.cond.cleanup:
+; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; DATA-NEXT:    ret i32 [[T_0_LCSSA]]
+; DATA:       for.body:
+; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; DATA-NEXT:    [[TMP1:%.*]] = sext i32 [[TMP0]] to i64
+; DATA-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP1]]
+; DATA-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP0]], i32 [[T_010]]
+; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+  %cmp9 = icmp sgt i32 %N, 0
+  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %spec.select.lcssa = phi i32 [ %spec.select, %for.body ]
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %t.0.lcssa = phi i32 [ -1, %entry ], [ %spec.select.lcssa, %for.cond.cleanup.loopexit ]
+  ret i32 %t.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %t.010 = phi i32 [ -1, %for.body.preheader ], [ %spec.select, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %data, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %1 = sext i32 %0 to i64
+  %cmp1 = icmp slt i64 %indvars.iv, %1
+  %spec.select = select i1 %cmp1, i32 %0, i32 %t.010
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; float simple_csa_float_select(int N, float *data) {
+;   float t = 1.0f;
+;   for (int i = 0; i < N; i++) {
+;     if (0.0f < data[i])
+;       t = data[i];
+;   }
+;   return t; // use t
+; }
+define float @simple_csa_float_select(i32 %N, ptr %data) {
+; EVL-LABEL: @simple_csa_float_select(
+; EVL-NEXT:  entry:
+; EVL-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT:    br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL:       for.body.preheader:
+; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; EVL:       for.cond.cleanup.loopexit:
+; EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; EVL:       for.cond.cleanup:
+; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT:    ret float [[T_0_LCSSA]]
+; EVL:       for.body:
+; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[T_09:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 0.000000e+00
+; EVL-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP0]], float [[T_09]]
+; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @simple_csa_float_select(
+; NO-EVL-NEXT:  entry:
+; NO-EVL-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT:    br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL:       for.body.preheader:
+; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-EVL:       for.cond.cleanup.loopexit:
+; NO-EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; NO-EVL:       for.cond.cleanup:
+; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT:    ret float [[T_0_LCSSA]]
+; NO-EVL:       for.body:
+; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[T_09:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 0.000000e+00
+; NO-EVL-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP0]], float [[T_09]]
+; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @simple_csa_float_select(
+; DATA-NEXT:  entry:
+; DATA-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT:    br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA:       for.body.preheader:
+; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT:    br label [[FOR_BODY:%.*]]
+; DATA:       for.cond.cleanup.loopexit:
+; DATA-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
+; DATA:       for.cond.cleanup:
+; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; DATA-NEXT:    ret float [[T_0_LCSSA]]
+; DATA:       for.body:
+; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[T_09:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; DATA-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 0.000000e+00
+; DATA-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP0]], float [[T_09]]
+; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+  %cmp8 = icmp sgt i32 %N, 0
+  br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.1, %for.body ]
+  ret float %t.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %t.09 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.body ]
+  %arrayidx = getelementptr inbounds float, ptr %data, i64 %indvars.iv
+  %0 = load float, ptr %arrayidx, align 4
+  %cmp1 = fcmp ogt float %0, 0.000000e+00
+  %t.1 = select i1 %cmp1, float %0, float %t.09
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int simple_csa_int(int N, bool *cond, int *data) {
+;   int t = -1;
+;   for (int i = 0; i < N; i++) {
+;     if (cond[i])
+;       t = data[i];
+;   }
+;   return t; // use t
+; }
+define i32 @simple_csa_int(i32 %N, ptr %cond, ptr %data) {
+; EVL-LABEL: @simple_csa_int(
+; EVL-NEXT:  entry:
+; EVL-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT:    br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL:       for.body.preheader:
+; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; EVL:       for.cond.cleanup.loopexit:
+; EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; EVL:       for.cond.cleanup:
+; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT:    ret i32 [[T_0_LCSSA]]
+; EVL:       for.body:
+; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; EVL-NEXT:    [[T_07:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; EVL:       if.then:
+; EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; EVL-NEXT:    br label [[FOR_INC]]
+; EVL:       for.inc:
+; EVL-NEXT:    [[T_1]] = phi i32 [ [[TMP1]], [[IF_THEN]] ], [ [[T_07]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @simple_csa_int(
+; NO-EVL-NEXT:  entry:
+; NO-EVL-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT:    br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL:       for.body.preheader:
+; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-EVL:       for.cond.cleanup.loopexit:
+; NO-EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; NO-EVL:       for.cond.cleanup:
+; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT:    ret i32 [[T_0_LCSSA]]
+; NO-EVL:       for.body:
+; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; NO-EVL-NEXT:    [[T_07:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; NO-EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; NO-EVL:       if.then:
+; NO-EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; NO-EVL-NEXT:    br label [[FOR_INC]]
+; NO-EVL:       for.inc:
+; NO-EVL-NEXT:    [[T_1]] = phi i32 [ [[TMP1]], [[IF_THEN]] ], [ [[T_07]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @simple_csa_int(
+; DATA-NEXT:  entry:
+; DATA-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT:    br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA:       for.body.preheader:
+; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT:    br label [[FOR_BODY:%.*]]
+; DATA:       for.cond.cleanup.loopexit:
+; DATA-NEXT:    [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
+; DATA:       for.cond.cleanup:
+; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; DATA-NEXT:    ret i32 [[T_0_LCSSA]]
+; DATA:       for.body:
+; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; DATA-NEXT:    [[T_07:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; DATA-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; DATA-NEXT:    br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; DATA:       if.then:
+; DATA-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; DATA-NEXT:    br label [[FOR_INC]]
+; DATA:       for.inc:
+; DATA-NEXT:    [[T_1]] = phi i32 [ [[TMP1]], [[IF_THEN]] ], [ [[T_07]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+  %cmp6 = icmp sgt i32 %N, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.inc, %entry
+  %t.0.lcssa = phi i32 [ -1, %entry ], [ %t.1, %for.inc ]
+  ret i32 %t.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %t.07 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
+  %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %indvars.iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %tobool.not = icmp eq i8 %0, 0
+  br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %arrayidx2 = getelementptr inbounds i32, ptr %data, i64 %indvars.iv
+  %1 = load i32, ptr %arrayidx2, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %t.1 = phi i32 [ %1, %if.then ], [ %t.07, %for.body ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; float simple_csa_float(int N, bool *cond, float *data) {
+;   float t = 1.0f;
+;   for (int i = 0; i < N; i++) {
+;     if (cond[i])
+;       t = data[i];
+;   }
+;   return t; // use t
+; }
+define float @simple_csa_float(i32 %N, ptr %cond, ptr %data) {
+; EVL-LABEL: @simple_csa_float(
+; EVL-NEXT:  entry:
+; EVL-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT:    br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL:       for.body.preheader:
+; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; EVL:       for.cond.cleanup.loopexit:
+; EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; EVL:       for.cond.cleanup:
+; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT:    ret float [[T_0_LCSSA]]
+; EVL:       for.body:
+; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; EVL-NEXT:    [[T_07:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; EVL:       if.then:
+; EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; EVL-NEXT:    br label [[FOR_INC]]
+; EVL:       for.inc:
+; EVL-NEXT:    [[T_1]] = phi float [ [[TMP1]], [[IF_THEN]] ], [ [[T_07]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @simple_csa_float(
+; NO-EVL-NEXT:  entry:
+; NO-EVL-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT:    br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL:       for.body.preheader:
+; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-EVL:       for.cond.cleanup.loopexit:
+; NO-EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; NO-EVL:       for.cond.cleanup:
+; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT:    ret float [[T_0_LCSSA]]
+; NO-EVL:       for.body:
+; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; NO-EVL-NEXT:    [[T_07:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; NO-EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; NO-EVL:       if.then:
+; NO-EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; NO-EVL-NEXT:    br label [[FOR_INC]]
+; NO-EVL:       for.inc:
+; NO-EVL-NEXT:    [[T_1]] = phi float [ [[TMP1]], [[IF_THEN]] ], [ [[T_07]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @simple_csa_float(
+; DATA-NEXT:  entry:
+; DATA-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT:    br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA:       for.body.preheader:
+; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT:    br label [[FOR_BODY:%.*]]
+; DATA:       for.cond.cleanup.loopexit:
+; DATA-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
+; DATA:       for.cond.cleanup:
+; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; DATA-NEXT:    ret float [[T_0_LCSSA]]
+; DATA:       for.body:
+; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; DATA-NEXT:    [[T_07:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; DATA-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; DATA-NEXT:    br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; DATA:       if.then:
+; DATA-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; DATA-NEXT:    br label [[FOR_INC]]
+; DATA:       for.inc:
+; DATA-NEXT:    [[T_1]] = phi float [ [[TMP1]], [[IF_THEN]] ], [ [[T_07]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+  %cmp6 = icmp sgt i32 %N, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.inc, %entry
+  %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.1, %for.inc ]
+  ret float %t.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %t.07 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
+  %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %indvars.iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %tobool.not = icmp eq i8 %0, 0
+  br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %arrayidx2 = getelementptr inbounds float, ptr %data, i64 %indvars.iv
+  %1 = load float, ptr %arrayidx2, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %t.1 = phi float [ %1, %if.then ], [ %t.07, %for.body ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int csa_in_series_int_select(int N, int *data0, int *data1, int a) {
+;   int t = -1;
+;   int s = -1;
+;   for (int i = 0; i < N; i++) {
+;     if (a < data0[i])
+;       t = data0[i];
+;     if (a < data1[i])
+;       s = data1[i];
+;   }
+;   return t | s; // use t and s
+; }
+define i32 @csa_in_series_int_select(i32 %N, ptr %data0, ptr %data1, i64 %a) {
+; EVL-LABEL: @csa_in_series_int_select(
+; EVL-NEXT:  entry:
+; EVL-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL:       for.body.preheader:
+; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; EVL:       for.cond.cleanup.loopexit:
+; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
+; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; EVL:       for.cond.cleanup:
+; EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
+; EVL-NEXT:    ret i32 [[OR]]
+; EVL:       for.body:
+; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
+; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A:%.*]], [[TMP2]]
+; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP1]], i32 [[T_022]]
+; EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; EVL-NEXT:    [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
+; EVL-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[A]], [[TMP4]]
+; EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP3]], i32 [[S_023]]
+; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @csa_in_series_int_select(
+; NO-EVL-NEXT:  entry:
+; NO-EVL-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL:       for.body.preheader:
+; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-EVL:       for.cond.cleanup.loopexit:
+; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
+; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; NO-EVL:       for.cond.cleanup:
+; NO-EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
+; NO-EVL-NEXT:    ret i32 [[OR]]
+; NO-EVL:       for.body:
+; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
+; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A:%.*]], [[TMP2]]
+; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP1]], i32 [[T_022]]
+; NO-EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; NO-EVL-NEXT:    [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
+; NO-EVL-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[A]], [[TMP4]]
+; NO-EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP3]], i32 [[S_023]]
+; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @csa_in_series_int_select(
+; DATA-NEXT:  entry:
+; DATA-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA:       for.body.preheader:
+; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT:    br label [[FOR_BODY:%.*]]
+; DATA:       for.cond.cleanup.loopexit:
+; DATA-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
+; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
+; DATA:       for.cond.cleanup:
+; DATA-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
+; DATA-NEXT:    ret i32 [[OR]]
+; DATA:       for.body:
+; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; DATA-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
+; DATA-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A:%.*]], [[TMP2]]
+; DATA-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP1]], i32 [[T_022]]
+; DATA-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; DATA-NEXT:    [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
+; DATA-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[A]], [[TMP4]]
+; DATA-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP3]], i32 [[S_023]]
+; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+  %cmp21 = icmp sgt i32 %N, 0
+  br i1 %cmp21, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %0 = or i32 %s.1, %spec.select
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %or = phi i32 [ %0, %for.cond.cleanup.loopexit ], [ -1, %entry ]
+  ret i32 %or
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %s.023 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.body ]
+  %t.022 = phi i32 [ -1, %for.body.preheader ], [ %spec.select, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
+  %1 = load i32, ptr %arrayidx, align 4
+  %2 = sext i32 %1 to i64
+  %cmp1 = icmp slt i64 %a, %2
+  %spec.select = select i1 %cmp1, i32 %1, i32 %t.022
+  %arrayidx5 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
+  %3 = load i32, ptr %arrayidx5, align 4
+  %4 = sext i32 %3 to i64
+  %cmp6 = icmp slt i64 %a, %4
+  %s.1 = select i1 %cmp6, i32 %3, i32 %s.023
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int csa_in_series_int_select(int N, int *data0, int *data1) {
+;   int t = -1;
+;   int s = -1;
+;   for (int i = 0; i < N; i++) {
+;     if (a < data0[i])
+;       t = data0[i];
+;     if (a < data1[i])
+;       s = data1[i];
+;   }
+;   return t | s; // use t and s
+; }
+define i32 @csa_in_series_int_select_induction_cmp(i32 %N, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_in_series_int_select_induction_cmp(
+; EVL-NEXT:  entry:
+; EVL-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL:       for.body.preheader:
+; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; EVL:       for.cond.cleanup.loopexit:
+; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
+; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; EVL:       for.cond.cleanup:
+; EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
+; EVL-NEXT:    ret i32 [[OR]]
+; EVL:       for.body:
+; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
+; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP2]]
+; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP1]], i32 [[T_022]]
+; EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; EVL-NEXT:    [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
+; EVL-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP4]]
+; EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP3]], i32 [[S_023]]
+; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @csa_in_series_int_select_induction_cmp(
+; NO-EVL-NEXT:  entry:
+; NO-EVL-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL:       for.body.preheader:
+; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-EVL:       for.cond.cleanup.loopexit:
+; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
+; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; NO-EVL:       for.cond.cleanup:
+; NO-EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
+; NO-EVL-NEXT:    ret i32 [[OR]]
+; NO-EVL:       for.body:
+; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
+; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP2]]
+; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP1]], i32 [[T_022]]
+; NO-EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; NO-EVL-NEXT:    [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
+; NO-EVL-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP4]]
+; NO-EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP3]], i32 [[S_023]]
+; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @csa_in_series_int_select_induction_cmp(
+; DATA-NEXT:  entry:
+; DATA-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA:       for.body.preheader:
+; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT:    br label [[FOR_BODY:%.*]]
+; DATA:       for.cond.cleanup.loopexit:
+; DATA-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
+; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
+; DATA:       for.cond.cleanup:
+; DATA-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
+; DATA-NEXT:    ret i32 [[OR]]
+; DATA:       for.body:
+; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; DATA-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
+; DATA-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP2]]
+; DATA-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP1]], i32 [[T_022]]
+; DATA-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; DATA-NEXT:    [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
+; DATA-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP4]]
+; DATA-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP3]], i32 [[S_023]]
+; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+  %cmp21 = icmp sgt i32 %N, 0
+  br i1 %cmp21, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %0 = or i32 %s.1, %spec.select
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %or = phi i32 [ %0, %for.cond.cleanup.loopexit ], [ -1, %entry ]
+  ret i32 %or
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %s.023 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.body ]
+  %t.022 = phi i32 [ -1, %for.body.preheader ], [ %spec.select, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
+  %1 = load i32, ptr %arrayidx, align 4
+  %2 = sext i32 %1 to i64
+  %cmp1 = icmp slt i64 %indvars.iv, %2
+  %spec.select = select i1 %cmp1, i32 %1, i32 %t.022
+  %arrayidx5 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
+  %3 = load i32, ptr %arrayidx5, align 4
+  %4 = sext i32 %3 to i64
+  %cmp6 = icmp slt i64 %indvars.iv, %4
+  %s.1 = select i1 %cmp6, i32 %3, i32 %s.023
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; float csa_in_series_float_select(int N, float *data0,
+;                           float *data1) {
+;   float t = 1.0f;
+;   float s = 1.0f;
+;   for (int i = 0; i < N; i++) {
+;     if (0.0f < data0[i])
+;       t = data0[i];
+;     if (0.0f <data1[i])
+;       s = data1[i];
+;   }
+;   return t + s; // use t and s
+; }
+define float @csa_in_series_float_select(i32 %N, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_in_series_float_select(
+; EVL-NEXT:  entry:
+; EVL-NEXT:    [[CMP19:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT:    br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL:       for.body.preheader:
+; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; EVL:       for.cond.cleanup.loopexit:
+; EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
+; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; EVL:       for.cond.cleanup:
+; EVL-NEXT:    [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
+; EVL-NEXT:    ret float [[ADD]]
+; EVL:       for.body:
+; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[S_021:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[T_020:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP1:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP1]], 0.000000e+00
+; EVL-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP1]], float [[T_020]]
+; EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
+; EVL-NEXT:    [[CMP6:%.*]] = fcmp ogt float [[TMP2]], 0.000000e+00
+; EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], float [[TMP2]], float [[S_021]]
+; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @csa_in_series_float_select(
+; NO-EVL-NEXT:  entry:
+; NO-EVL-NEXT:    [[CMP19:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT:    br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL:       for.body.preheader:
+; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-EVL:       for.cond.cleanup.loopexit:
+; NO-EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
+; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; NO-EVL:       for.cond.cleanup:
+; NO-EVL-NEXT:    [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
+; NO-EVL-NEXT:    ret float [[ADD]]
+; NO-EVL:       for.body:
+; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[S_021:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[T_020:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP1:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP1]], 0.000000e+00
+; NO-EVL-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP1]], float [[T_020]]
+; NO-EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
+; NO-EVL-NEXT:    [[CMP6:%.*]] = fcmp ogt float [[TMP2]], 0.000000e+00
+; NO-EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], float [[TMP2]], float [[S_021]]
+; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @csa_in_series_float_select(
+; DATA-NEXT:  entry:
+; DATA-NEXT:    [[CMP19:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT:    br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA:       for.body.preheader:
+; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT:    br label [[FOR_BODY:%.*]]
+; DATA:       for.cond.cleanup.loopexit:
+; DATA-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
+; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
+; DATA:       for.cond.cleanup:
+; DATA-NEXT:    [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
+; DATA-NEXT:    ret float [[ADD]]
+; DATA:       for.body:
+; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[S_021:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[T_020:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP1:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; DATA-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP1]], 0.000000e+00
+; DATA-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP1]], float [[T_020]]
+; DATA-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
+; DATA-NEXT:    [[CMP6:%.*]] = fcmp ogt float [[TMP2]], 0.000000e+00
+; DATA-NEXT:    [[S_1]] = select i1 [[CMP6]], float [[TMP2]], float [[S_021]]
+; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+  %cmp19 = icmp sgt i32 %N, 0
+  br i1 %cmp19, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %0 = fadd float %t.1, %s.1
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %add = phi float [ %0, %for.cond.cleanup.loopexit ], [ 2.000000e+00, %entry ]
+  ret float %add
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %s.021 = phi float [ 1.000000e+00, %for.body.preheader ], [ %s.1, %for.body ]
+  %t.020 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.body ]
+  %arrayidx = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
+  %1 = load float, ptr %arrayidx, align 4
+  %cmp1 = fcmp ogt float %1, 0.000000e+00
+  %t.1 = select i1 %cmp1, float %1, float %t.020
+  %arrayidx5 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
+  %2 = load float, ptr %arrayidx5, align 4
+  %cmp6 = fcmp ogt float %2, 0.000000e+00
+  %s.1 = select i1 %cmp6, float %2, float %s.021
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int csa_in_series_int(int N, bool *cond0, bool *cond1, int *data0, int *data1) {
+;   int t = -1;
+;   int s = -1;
+;   for (int i = 0; i < N; i++) {
+;     if (cond0[i])
+;       t = data0[i];
+;     if (cond1[i])
+;       s = data1[i];
+;   }
+;   return t | s; // use t and s
+; }
+define i32 @csa_in_series_int(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_in_series_int(
+; EVL-NEXT:  entry:
+; EVL-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL:       for.body.preheader:
+; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; EVL:       for.cond.cleanup.loopexit:
+; EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_INC:%.*]] ]
+; EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC]] ]
+; EVL-NEXT:    [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[T_1_LCSSA]]
+; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; EVL:       for.cond.cleanup:
+; EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
+; EVL-NEXT:    ret i32 [[OR]]
+; EVL:       for.body:
+; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; EVL-NEXT:    [[S_017:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
+; EVL-NEXT:    [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; EVL:       if.then:
+; EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; EVL-NEXT:    br label [[IF_END]]
+; EVL:       if.end:
+; EVL-NEXT:    [[T_1]] = phi i32 [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; EVL-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
+; EVL-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
+; EVL:       if.then6:
+; EVL-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4
+; EVL-NEXT:    br label [[FOR_INC]]
+; EVL:       for.inc:
+; EVL-NEXT:    [[S_1]] = phi i32 [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_END]] ]
+; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @csa_in_series_int(
+; NO-EVL-NEXT:  entry:
+; NO-EVL-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL:       for.body.preheader:
+; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-EVL:       for.cond.cleanup.loopexit:
+; NO-EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_INC:%.*]] ]
+; NO-EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC]] ]
+; NO-EVL-NEXT:    [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[T_1_LCSSA]]
+; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; NO-EVL:       for.cond.cleanup:
+; NO-EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
+; NO-EVL-NEXT:    ret i32 [[OR]]
+; NO-EVL:       for.body:
+; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; NO-EVL-NEXT:    [[S_017:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
+; NO-EVL-NEXT:    [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; NO-EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; NO-EVL:       if.then:
+; NO-EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; NO-EVL-NEXT:    br label [[IF_END]]
+; NO-EVL:       if.end:
+; NO-EVL-NEXT:    [[T_1]] = phi i32 [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; NO-EVL-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
+; NO-EVL-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
+; NO-EVL:       if.then6:
+; NO-EVL-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4
+; NO-EVL-NEXT:    br label [[FOR_INC]]
+; NO-EVL:       for.inc:
+; NO-EVL-NEXT:    [[S_1]] = phi i32 [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_END]] ]
+; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @csa_in_series_int(
+; DATA-NEXT:  entry:
+; DATA-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA:       for.body.preheader:
+; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT:    br label [[FOR_BODY:%.*]]
+; DATA:       for.cond.cleanup.loopexit:
+; DATA-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_INC:%.*]] ]
+; DATA-NEXT:    [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC]] ]
+; DATA-NEXT:    [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[T_1_LCSSA]]
+; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
+; DATA:       for.cond.cleanup:
+; DATA-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
+; DATA-NEXT:    ret i32 [[OR]]
+; DATA:       for.body:
+; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; DATA-NEXT:    [[S_017:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
+; DATA-NEXT:    [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; DATA-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; DATA-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; DATA:       if.then:
+; DATA-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; DATA-NEXT:    br label [[IF_END]]
+; DATA:       if.end:
+; DATA-NEXT:    [[T_1]] = phi i32 [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; DATA-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
+; DATA-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
+; DATA:       if.then6:
+; DATA-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4
+; DATA-NEXT:    br label [[FOR_INC]]
+; DATA:       for.inc:
+; DATA-NEXT:    [[S_1]] = phi i32 [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_END]] ]
+; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+  %cmp15 = icmp sgt i32 %N, 0
+  br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.inc
+  %0 = or i32 %s.1, %t.1
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %or = phi i32 [ %0, %for.cond.cleanup.loopexit ], [ -1, %entry ]
+  ret i32 %or
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %s.017 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.inc ]
+  %t.016 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
+  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+  %1 = load i8, ptr %arrayidx, align 1
+  %tobool.not = icmp eq i8 %1, 0
+  br i1 %tobool.not, label %if.end, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
+  %2 = load i32, ptr %arrayidx2, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  %t.1 = phi i32 [ %2, %if.then ], [ %t.016, %for.body ]
+  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+  %3 = load i8, ptr %arrayidx4, align 1
+  %tobool5.not = icmp eq i8 %3, 0
+  br i1 %tobool5.not, label %for.inc, label %if.then6
+
+if.then6:                                         ; preds = %if.end
+  %arrayidx8 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
+  %4 = load i32, ptr %arrayidx8, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end, %if.then6
+  %s.1 = phi i32 [ %4, %if.then6 ], [ %s.017, %if.end ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; float csa_in_series_float(int N, bool *cond0, bool *cond1, float *data0,
+;                           float *data1) {
+;   float t = 1.0f;
+;   float s = 1.0f;
+;   for (int i = 0; i < N; i++) {
+;     if (cond0[i])
+;       t = data0[i];
+;     if (cond1[i])
+;       s = data1[i];
+;   }
+;   return t + s; // use t and s
+; }
+define float @csa_in_series_float(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_in_series_float(
+; EVL-NEXT:  entry:
+; EVL-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL:       for.body.preheader:
+; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; EVL:       for.cond.cleanup.loopexit:
+; EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_INC:%.*]] ]
+; EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC]] ]
+; EVL-NEXT:    [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
+; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; EVL:       for.cond.cleanup:
+; EVL-NEXT:    [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
+; EVL-NEXT:    ret float [[ADD]]
+; EVL:       for.body:
+; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; EVL-NEXT:    [[S_017:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
+; EVL-NEXT:    [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; EVL:       if.then:
+; EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; EVL-NEXT:    br label [[IF_END]]
+; EVL:       if.end:
+; EVL-NEXT:    [[T_1]] = phi float [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; EVL-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
+; EVL-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
+; EVL:       if.then6:
+; EVL-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP4:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
+; EVL-NEXT:    br label [[FOR_INC]]
+; EVL:       for.inc:
+; EVL-NEXT:    [[S_1]] = phi float [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_END]] ]
+; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @csa_in_series_float(
+; NO-EVL-NEXT:  entry:
+; NO-EVL-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL:       for.body.preheader:
+; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-EVL:       for.cond.cleanup.loopexit:
+; NO-EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_INC:%.*]] ]
+; NO-EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC]] ]
+; NO-EVL-NEXT:    [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
+; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; NO-EVL:       for.cond.cleanup:
+; NO-EVL-NEXT:    [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
+; NO-EVL-NEXT:    ret float [[ADD]]
+; NO-EVL:       for.body:
+; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; NO-EVL-NEXT:    [[S_017:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
+; NO-EVL-NEXT:    [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; NO-EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; NO-EVL:       if.then:
+; NO-EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; NO-EVL-NEXT:    br label [[IF_END]]
+; NO-EVL:       if.end:
+; NO-EVL-NEXT:    [[T_1]] = phi float [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; NO-EVL-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
+; NO-EVL-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
+; NO-EVL:       if.then6:
+; NO-EVL-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP4:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
+; NO-EVL-NEXT:    br label [[FOR_INC]]
+; NO-EVL:       for.inc:
+; NO-EVL-NEXT:    [[S_1]] = phi float [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_END]] ]
+; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @csa_in_series_float(
+; DATA-NEXT:  entry:
+; DATA-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA:       for.body.preheader:
+; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT:    br label [[FOR_BODY:%.*]]
+; DATA:       for.cond.cleanup.loopexit:
+; DATA-NEXT:    [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_INC:%.*]] ]
+; DATA-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC]] ]
+; DATA-NEXT:    [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
+; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
+; DATA:       for.cond.cleanup:
+; DATA-NEXT:    [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
+; DATA-NEXT:    ret float [[ADD]]
+; DATA:       for.body:
+; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; DATA-NEXT:    [[S_017:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
+; DATA-NEXT:    [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; DATA-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; DATA-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; DATA:       if.then:
+; DATA-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; DATA-NEXT:    br label [[IF_END]]
+; DATA:       if.end:
+; DATA-NEXT:    [[T_1]] = phi float [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; DATA-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
+; DATA-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
+; DATA:       if.then6:
+; DATA-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP4:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
+; DATA-NEXT:    br label [[FOR_INC]]
+; DATA:       for.inc:
+; DATA-NEXT:    [[S_1]] = phi float [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_END]] ]
+; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+  %cmp15 = icmp sgt i32 %N, 0
+  br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.inc
+  %0 = fadd float %t.1, %s.1
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %add = phi float [ %0, %for.cond.cleanup.loopexit ], [ 2.000000e+00, %entry ]
+  ret float %add
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %s.017 = phi float [ 1.000000e+00, %for.body.preheader ], [ %s.1, %for.inc ]
+  %t.016 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
+  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+  %1 = load i8, ptr %arrayidx, align 1
+  %tobool.not = icmp eq i8 %1, 0
+  br i1 %tobool.not, label %if.end, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
+  %2 = load float, ptr %arrayidx2, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  %t.1 = phi float [ %2, %if.then ], [ %t.016, %for.body ]
+  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+  %3 = load i8, ptr %arrayidx4, align 1
+  %tobool5.not = icmp eq i8 %3, 0
+  br i1 %tobool5.not, label %for.inc, label %if.then6
+
+if.then6:                                         ; preds = %if.end
+  %arrayidx8 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
+  %4 = load float, ptr %arrayidx8, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end, %if.then6
+  %s.1 = phi float [ %4, %if.then6 ], [ %s.017, %if.end ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int csa_in_series_same_scalar_int_select(int N, int *data0,
+;                                   int *data1) {
+;   int t = -1;
+;   for (int i = 0; i < N; i++) {
+;     if (i < data0[i])
+;       t = data0[i];
+;     if (i < data1[i])
+;       t = data1[i];
+;   }
+;   return t; // use t
+; }
+define i32 @csa_in_series_same_scalar_int_select(i32 %N, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_in_series_same_scalar_int_select(
+; EVL-NEXT:  entry:
+; EVL-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL:       for.body.preheader:
+; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; EVL:       for.cond.cleanup.loopexit:
+; EVL-NEXT:    [[T_2_LCSSA:%.*]] = phi i32 [ [[T_2:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; EVL:       for.cond.cleanup:
+; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT:    ret i32 [[T_0_LCSSA]]
+; EVL:       for.body:
+; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT:    [[TMP1:%.*]] = sext i32 [[TMP0]] to i64
+; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP1]]
+; EVL-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[CMP1]], i32 [[TMP0]], i32 [[T_022]]
+; EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; EVL-NEXT:    [[TMP3:%.*]] = sext i32 [[TMP2]] to i64
+; EVL-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP3]]
+; EVL-NEXT:    [[T_2]] = select i1 [[CMP6]], i32 [[TMP2]], i32 [[SPEC_SELECT]]
+; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @csa_in_series_same_scalar_int_select(
+; NO-EVL-NEXT:  entry:
+; NO-EVL-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL:       for.body.preheader:
+; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-EVL:       for.cond.cleanup.loopexit:
+; NO-EVL-NEXT:    [[T_2_LCSSA:%.*]] = phi i32 [ [[T_2:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; NO-EVL:       for.cond.cleanup:
+; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT:    ret i32 [[T_0_LCSSA]]
+; NO-EVL:       for.body:
+; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT:    [[TMP1:%.*]] = sext i32 [[TMP0]] to i64
+; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP1]]
+; NO-EVL-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[CMP1]], i32 [[TMP0]], i32 [[T_022]]
+; NO-EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; NO-EVL-NEXT:    [[TMP3:%.*]] = sext i32 [[TMP2]] to i64
+; NO-EVL-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP3]]
+; NO-EVL-NEXT:    [[T_2]] = select i1 [[CMP6]], i32 [[TMP2]], i32 [[SPEC_SELECT]]
+; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @csa_in_series_same_scalar_int_select(
+; DATA-NEXT:  entry:
+; DATA-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA:       for.body.preheader:
+; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT:    br label [[FOR_BODY:%.*]]
+; DATA:       for.cond.cleanup.loopexit:
+; DATA-NEXT:    [[T_2_LCSSA:%.*]] = phi i32 [ [[T_2:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
+; DATA:       for.cond.cleanup:
+; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; DATA-NEXT:    ret i32 [[T_0_LCSSA]]
+; DATA:       for.body:
+; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; DATA-NEXT:    [[TMP1:%.*]] = sext i32 [[TMP0]] to i64
+; DATA-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP1]]
+; DATA-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[CMP1]], i32 [[TMP0]], i32 [[T_022]]
+; DATA-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; DATA-NEXT:    [[TMP3:%.*]] = sext i32 [[TMP2]] to i64
+; DATA-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP3]]
+; DATA-NEXT:    [[T_2]] = select i1 [[CMP6]], i32 [[TMP2]], i32 [[SPEC_SELECT]]
+; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+  %cmp21 = icmp sgt i32 %N, 0
+  br i1 %cmp21, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  %t.0.lcssa = phi i32 [ -1, %entry ], [ %t.2, %for.body ]
+  ret i32 %t.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %t.022 = phi i32 [ -1, %for.body.preheader ], [ %t.2, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %1 = sext i32 %0 to i64
+  %cmp1 = icmp slt i64 %indvars.iv, %1
+  %spec.select = select i1 %cmp1, i32 %0, i32 %t.022
+  %arrayidx5 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
+  %2 = load i32, ptr %arrayidx5, align 4
+  %3 = sext i32 %2 to i64
+  %cmp6 = icmp slt i64 %indvars.iv, %3
+  %t.2 = select i1 %cmp6, i32 %2, i32 %spec.select
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; float csa_in_series_same_scalar_float_select(int N,
+;                                       float *data0, float *data1) {
+;   float t = 1.0f;
+;   for (int i = 0; i < N; i++) {
+;     if (0.0f < data0[i])
+;       t = data0[i];
+;     if (0.0f < data1[i])
+;       t = data1[i];
+;   }
+;   return t; // use t
+; }
+define float @csa_in_series_same_scalar_float_select(i32 %N, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_in_series_same_scalar_float_select(
+; EVL-NEXT:  entry:
+; EVL-NEXT:    [[CMP19:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT:    br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL:       for.body.preheader:
+; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; EVL:       for.cond.cleanup.loopexit:
+; EVL-NEXT:    [[T_2_LCSSA:%.*]] = phi float [ [[T_2:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; EVL:       for.cond.cleanup:
+; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT:    ret float [[T_0_LCSSA]]
+; EVL:       for.body:
+; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[T_020:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 0.000000e+00
+; EVL-NEXT:    [[T_1:%.*]] = select i1 [[CMP1]], float [[TMP0]], float [[T_020]]
+; EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP1:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
+; EVL-NEXT:    [[CMP6:%.*]] = fcmp ogt float [[TMP1]], 0.000000e+00
+; EVL-NEXT:    [[T_2]] = select i1 [[CMP6]], float [[TMP1]], float [[T_1]]
+; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @csa_in_series_same_scalar_float_select(
+; NO-EVL-NEXT:  entry:
+; NO-EVL-NEXT:    [[CMP19:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT:    br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL:       for.body.preheader:
+; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-EVL:       for.cond.cleanup.loopexit:
+; NO-EVL-NEXT:    [[T_2_LCSSA:%.*]] = phi float [ [[T_2:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; NO-EVL:       for.cond.cleanup:
+; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT:    ret float [[T_0_LCSSA]]
+; NO-EVL:       for.body:
+; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[T_020:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 0.000000e+00
+; NO-EVL-NEXT:    [[T_1:%.*]] = select i1 [[CMP1]], float [[TMP0]], float [[T_020]]
+; NO-EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP1:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
+; NO-EVL-NEXT:    [[CMP6:%.*]] = fcmp ogt float [[TMP1]], 0.000000e+00
+; NO-EVL-NEXT:    [[T_2]] = select i1 [[CMP6]], float [[TMP1]], float [[T_1]]
+; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @csa_in_series_same_scalar_float_select(
+; DATA-NEXT:  entry:
+; DATA-NEXT:    [[CMP19:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT:    br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA:       for.body.preheader:
+; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT:    br label [[FOR_BODY:%.*]]
+; DATA:       for.cond.cleanup.loopexit:
+; DATA-NEXT:    [[T_2_LCSSA:%.*]] = phi float [ [[T_2:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
+; DATA:       for.cond.cleanup:
+; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; DATA-NEXT:    ret float [[T_0_LCSSA]]
+; DATA:       for.body:
+; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[T_020:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; DATA-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 0.000000e+00
+; DATA-NEXT:    [[T_1:%.*]] = select i1 [[CMP1]], float [[TMP0]], float [[T_020]]
+; DATA-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP1:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
+; DATA-NEXT:    [[CMP6:%.*]] = fcmp ogt float [[TMP1]], 0.000000e+00
+; DATA-NEXT:    [[T_2]] = select i1 [[CMP6]], float [[TMP1]], float [[T_1]]
+; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+  %cmp19 = icmp sgt i32 %N, 0
+  br i1 %cmp19, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.2, %for.body ]
+  ret float %t.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %t.020 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.2, %for.body ]
+  %arrayidx = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
+  %0 = load float, ptr %arrayidx, align 4
+  %cmp1 = fcmp ogt float %0, 0.000000e+00
+  %t.1 = select i1 %cmp1, float %0, float %t.020
+  %arrayidx5 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
+  %1 = load float, ptr %arrayidx5, align 4
+  %cmp6 = fcmp ogt float %1, 0.000000e+00
+  %t.2 = select i1 %cmp6, float %1, float %t.1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int csa_in_series_same_scalar_int(int N, bool *cond0, bool *cond1, int *data0,
+;                                   int *data1) {
+;   int t = -1;
+;   for (int i = 0; i < N; i++) {
+;     if (cond0[i])
+;       t = data0[i];
+;     if (cond1[i])
+;       t = data1[i];
+;   }
+;   return t; // use t
+; }
+define i32 @csa_in_series_same_scalar_int(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_in_series_same_scalar_int(
+; EVL-NEXT:  entry:
+; EVL-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL:       for.body.preheader:
+; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; EVL:       for.cond.cleanup.loopexit:
+; EVL-NEXT:    [[T_2_LCSSA:%.*]] = phi i32 [ [[T_2:%.*]], [[FOR_INC:%.*]] ]
+; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; EVL:       for.cond.cleanup:
+; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT:    ret i32 [[T_0_LCSSA]]
+; EVL:       for.body:
+; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; EVL-NEXT:    [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_INC]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; EVL:       if.then:
+; EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; EVL-NEXT:    br label [[IF_END]]
+; EVL:       if.end:
+; EVL-NEXT:    [[T_1:%.*]] = phi i32 [ [[TMP1]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; EVL-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP2]], 0
+; EVL-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
+; EVL:       if.then6:
+; EVL-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4
+; EVL-NEXT:    br label [[FOR_INC]]
+; EVL:       for.inc:
+; EVL-NEXT:    [[T_2]] = phi i32 [ [[TMP3]], [[IF_THEN6]] ], [ [[T_1]], [[IF_END]] ]
+; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @csa_in_series_same_scalar_int(
+; NO-EVL-NEXT:  entry:
+; NO-EVL-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL:       for.body.preheader:
+; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-EVL:       for.cond.cleanup.loopexit:
+; NO-EVL-NEXT:    [[T_2_LCSSA:%.*]] = phi i32 [ [[T_2:%.*]], [[FOR_INC:%.*]] ]
+; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; NO-EVL:       for.cond.cleanup:
+; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT:    ret i32 [[T_0_LCSSA]]
+; NO-EVL:       for.body:
+; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; NO-EVL-NEXT:    [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_INC]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; NO-EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; NO-EVL:       if.then:
+; NO-EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; NO-EVL-NEXT:    br label [[IF_END]]
+; NO-EVL:       if.end:
+; NO-EVL-NEXT:    [[T_1:%.*]] = phi i32 [ [[TMP1]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; NO-EVL-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP2]], 0
+; NO-EVL-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
+; NO-EVL:       if.then6:
+; NO-EVL-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4
+; NO-EVL-NEXT:    br label [[FOR_INC]]
+; NO-EVL:       for.inc:
+; NO-EVL-NEXT:    [[T_2]] = phi i32 [ [[TMP3]], [[IF_THEN6]] ], [ [[T_1]], [[IF_END]] ]
+; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @csa_in_series_same_scalar_int(
+; DATA-NEXT:  entry:
+; DATA-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA:       for.body.preheader:
+; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT:    br label [[FOR_BODY:%.*]]
+; DATA:       for.cond.cleanup.loopexit:
+; DATA-NEXT:    [[T_2_LCSSA:%.*]] = phi i32 [ [[T_2:%.*]], [[FOR_INC:%.*]] ]
+; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
+; DATA:       for.cond.cleanup:
+; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; DATA-NEXT:    ret i32 [[T_0_LCSSA]]
+; DATA:       for.body:
+; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; DATA-NEXT:    [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_INC]] ]
+; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; DATA-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; DATA-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; DATA:       if.then:
+; DATA-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; DATA-NEXT:    br label [[IF_END]]
+; DATA:       if.end:
+; DATA-NEXT:    [[T_1:%.*]] = phi i32 [ [[TMP1]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; DATA-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP2]], 0
+; DATA-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
+; DATA:       if.then6:
+; DATA-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4
+; DATA-NEXT:    br label [[FOR_INC]]
+; DATA:       for.inc:
+; DATA-NEXT:    [[T_2]] = phi i32 [ [[TMP3]], [[IF_THEN6]] ], [ [[T_1]], [[IF_END]] ]
+; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+  %cmp15 = icmp sgt i32 %N, 0
+  br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.inc, %entry
+  %t.0.lcssa = phi i32 [ -1, %entry ], [ %t.2, %for.inc ]
+  ret i32 %t.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %t.016 = phi i32 [ -1, %for.body.preheader ], [ %t.2, %for.inc ]
+  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %tobool.not = icmp eq i8 %0, 0
+  br i1 %tobool.not, label %if.end, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
+  %1 = load i32, ptr %arrayidx2, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  %t.1 = phi i32 [ %1, %if.then ], [ %t.016, %for.body ]
+  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+  %2 = load i8, ptr %arrayidx4, align 1
+  %tobool5.not = icmp eq i8 %2, 0
+  br i1 %tobool5.not, label %for.inc, label %if.then6
+
+if.then6:                                         ; preds = %if.end
+  %arrayidx8 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
+  %3 = load i32, ptr %arrayidx8, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end, %if.then6
+  %t.2 = phi i32 [ %3, %if.then6 ], [ %t.1, %if.end ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; float csa_in_series_same_scalar_float(int N, bool *cond0, bool *cond1,
+;                                       float *data0, float *data1) {
+;   float t = 1.0f;
+;   for (int i = 0; i < N; i++) {
+;     if (cond0[i])
+;       t = data0[i];
+;     if (cond1[i])
+;       t = data1[i];
+;   }
+;   return t; // use t
+; }
+define float @csa_in_series_same_scalar_float(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_in_series_same_scalar_float(
+; EVL-NEXT:  entry:
+; EVL-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL:       for.body.preheader:
+; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; EVL:       for.cond.cleanup.loopexit:
+; EVL-NEXT:    [[T_2_LCSSA:%.*]] = phi float [ [[T_2:%.*]], [[FOR_INC:%.*]] ]
+; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; EVL:       for.cond.cleanup:
+; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT:    ret float [[T_0_LCSSA]]
+; EVL:       for.body:
+; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; EVL-NEXT:    [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_INC]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; EVL:       if.then:
+; EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; EVL-NEXT:    br label [[IF_END]]
+; EVL:       if.end:
+; EVL-NEXT:    [[T_1:%.*]] = phi float [ [[TMP1]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; EVL-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP2]], 0
+; EVL-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
+; EVL:       if.then6:
+; EVL-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP3:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
+; EVL-NEXT:    br label [[FOR_INC]]
+; EVL:       for.inc:
+; EVL-NEXT:    [[T_2]] = phi float [ [[TMP3]], [[IF_THEN6]] ], [ [[T_1]], [[IF_END]] ]
+; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @csa_in_series_same_scalar_float(
+; NO-EVL-NEXT:  entry:
+; NO-EVL-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL:       for.body.preheader:
+; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-EVL:       for.cond.cleanup.loopexit:
+; NO-EVL-NEXT:    [[T_2_LCSSA:%.*]] = phi float [ [[T_2:%.*]], [[FOR_INC:%.*]] ]
+; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; NO-EVL:       for.cond.cleanup:
+; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT:    ret float [[T_0_LCSSA]]
+; NO-EVL:       for.body:
+; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; NO-EVL-NEXT:    [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_INC]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; NO-EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; NO-EVL:       if.then:
+; NO-EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; NO-EVL-NEXT:    br label [[IF_END]]
+; NO-EVL:       if.end:
+; NO-EVL-NEXT:    [[T_1:%.*]] = phi float [ [[TMP1]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; NO-EVL-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP2]], 0
+; NO-EVL-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
+; NO-EVL:       if.then6:
+; NO-EVL-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP3:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
+; NO-EVL-NEXT:    br label [[FOR_INC]]
+; NO-EVL:       for.inc:
+; NO-EVL-NEXT:    [[T_2]] = phi float [ [[TMP3]], [[IF_THEN6]] ], [ [[T_1]], [[IF_END]] ]
+; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @csa_in_series_same_scalar_float(
+; DATA-NEXT:  entry:
+; DATA-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA:       for.body.preheader:
+; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT:    br label [[FOR_BODY:%.*]]
+; DATA:       for.cond.cleanup.loopexit:
+; DATA-NEXT:    [[T_2_LCSSA:%.*]] = phi float [ [[T_2:%.*]], [[FOR_INC:%.*]] ]
+; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
+; DATA:       for.cond.cleanup:
+; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; DATA-NEXT:    ret float [[T_0_LCSSA]]
+; DATA:       for.body:
+; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; DATA-NEXT:    [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_INC]] ]
+; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; DATA-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; DATA-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; DATA:       if.then:
+; DATA-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; DATA-NEXT:    br label [[IF_END]]
+; DATA:       if.end:
+; DATA-NEXT:    [[T_1:%.*]] = phi float [ [[TMP1]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; DATA-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP2]], 0
+; DATA-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
+; DATA:       if.then6:
+; DATA-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP3:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
+; DATA-NEXT:    br label [[FOR_INC]]
+; DATA:       for.inc:
+; DATA-NEXT:    [[T_2]] = phi float [ [[TMP3]], [[IF_THEN6]] ], [ [[T_1]], [[IF_END]] ]
+; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+  %cmp15 = icmp sgt i32 %N, 0
+  br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.inc, %entry
+  %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.2, %for.inc ]
+  ret float %t.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %t.016 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.2, %for.inc ]
+  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %tobool.not = icmp eq i8 %0, 0
+  br i1 %tobool.not, label %if.end, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
+  %1 = load float, ptr %arrayidx2, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  %t.1 = phi float [ %1, %if.then ], [ %t.016, %for.body ]
+  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+  %2 = load i8, ptr %arrayidx4, align 1
+  %tobool5.not = icmp eq i8 %2, 0
+  br i1 %tobool5.not, label %for.inc, label %if.then6
+
+if.then6:                                         ; preds = %if.end
+  %arrayidx8 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
+  %3 = load float, ptr %arrayidx8, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end, %if.then6
+  %t.2 = phi float [ %3, %if.then6 ], [ %t.1, %if.end ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int csa_same_cond_int(int N, bool *cond, int *data0, int *data1) {
+;   int t = -1;
+;   int s = -1;
+;   for (int i = 0; i < N; i++) {
+;     if (cond[i]) {
+;       t = data0[i];
+;       s = data1[i];
+;     }
+;   }
+;   return t | s; // use t and s
+; }
+define i32 @csa_same_cond_int(i32 %N, ptr %cond, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_same_cond_int(
+; EVL-NEXT:  entry:
+; EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL:       for.body.preheader:
+; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; EVL:       for.cond.cleanup.loopexit:
+; EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_INC]] ]
+; EVL-NEXT:    [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[T_1_LCSSA]]
+; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; EVL:       for.cond.cleanup:
+; EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
+; EVL-NEXT:    ret i32 [[OR]]
+; EVL:       for.body:
+; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; EVL-NEXT:    [[S_011:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
+; EVL-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; EVL:       if.then:
+; EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4
+; EVL-NEXT:    br label [[FOR_INC]]
+; EVL:       for.inc:
+; EVL-NEXT:    [[T_1]] = phi i32 [ [[TMP2]], [[IF_THEN]] ], [ [[T_010]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[S_1]] = phi i32 [ [[TMP3]], [[IF_THEN]] ], [ [[S_011]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @csa_same_cond_int(
+; NO-EVL-NEXT:  entry:
+; NO-EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL:       for.body.preheader:
+; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-EVL:       for.cond.cleanup.loopexit:
+; NO-EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; NO-EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_INC]] ]
+; NO-EVL-NEXT:    [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[T_1_LCSSA]]
+; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; NO-EVL:       for.cond.cleanup:
+; NO-EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
+; NO-EVL-NEXT:    ret i32 [[OR]]
+; NO-EVL:       for.body:
+; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; NO-EVL-NEXT:    [[S_011:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
+; NO-EVL-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; NO-EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; NO-EVL:       if.then:
+; NO-EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; NO-EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4
+; NO-EVL-NEXT:    br label [[FOR_INC]]
+; NO-EVL:       for.inc:
+; NO-EVL-NEXT:    [[T_1]] = phi i32 [ [[TMP2]], [[IF_THEN]] ], [ [[T_010]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[S_1]] = phi i32 [ [[TMP3]], [[IF_THEN]] ], [ [[S_011]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @csa_same_cond_int(
+; DATA-NEXT:  entry:
+; DATA-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA:       for.body.preheader:
+; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT:    br label [[FOR_BODY:%.*]]
+; DATA:       for.cond.cleanup.loopexit:
+; DATA-NEXT:    [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; DATA-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_INC]] ]
+; DATA-NEXT:    [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[T_1_LCSSA]]
+; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
+; DATA:       for.cond.cleanup:
+; DATA-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
+; DATA-NEXT:    ret i32 [[OR]]
+; DATA:       for.body:
+; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; DATA-NEXT:    [[S_011:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
+; DATA-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; DATA-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; DATA-NEXT:    br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; DATA:       if.then:
+; DATA-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; DATA-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4
+; DATA-NEXT:    br label [[FOR_INC]]
+; DATA:       for.inc:
+; DATA-NEXT:    [[T_1]] = phi i32 [ [[TMP2]], [[IF_THEN]] ], [ [[T_010]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[S_1]] = phi i32 [ [[TMP3]], [[IF_THEN]] ], [ [[S_011]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+  %cmp9 = icmp sgt i32 %N, 0
+  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.inc
+  %0 = or i32 %s.1, %t.1
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %or = phi i32 [ %0, %for.cond.cleanup.loopexit ], [ -1, %entry ]
+  ret i32 %or
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %s.011 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.inc ]
+  %t.010 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
+  %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %indvars.iv
+  %1 = load i8, ptr %arrayidx, align 1
+  %tobool.not = icmp eq i8 %1, 0
+  br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
+  %2 = load i32, ptr %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
+  %3 = load i32, ptr %arrayidx4, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %t.1 = phi i32 [ %2, %if.then ], [ %t.010, %for.body ]
+  %s.1 = phi i32 [ %3, %if.then ], [ %s.011, %for.body ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; float csa_same_cond_float(int N, bool *cond, float *data0, float *data1) {
+;   float t = 1.0f;
+;   float s = 1.0f;
+;   for (int i = 0; i < N; i++) {
+;     if (cond[i]) {
+;       t = data0[i];
+;       s = data1[i];
+;     }
+;   }
+;   return t + s; // use t and s
+; }
+define float @csa_same_cond_float(i32 %N, ptr %cond, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_same_cond_float(
+; EVL-NEXT:  entry:
+; EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL:       for.body.preheader:
+; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; EVL:       for.cond.cleanup.loopexit:
+; EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_INC]] ]
+; EVL-NEXT:    [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
+; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; EVL:       for.cond.cleanup:
+; EVL-NEXT:    [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
+; EVL-NEXT:    ret float [[ADD]]
+; EVL:       for.body:
+; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; EVL-NEXT:    [[S_011:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
+; EVL-NEXT:    [[T_010:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; EVL:       if.then:
+; EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP3:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
+; EVL-NEXT:    br label [[FOR_INC]]
+; EVL:       for.inc:
+; EVL-NEXT:    [[T_1]] = phi float [ [[TMP2]], [[IF_THEN]] ], [ [[T_010]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[S_1]] = phi float [ [[TMP3]], [[IF_THEN]] ], [ [[S_011]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @csa_same_cond_float(
+; NO-EVL-NEXT:  entry:
+; NO-EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL:       for.body.preheader:
+; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-EVL:       for.cond.cleanup.loopexit:
+; NO-EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; NO-EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_INC]] ]
+; NO-EVL-NEXT:    [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
+; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; NO-EVL:       for.cond.cleanup:
+; NO-EVL-NEXT:    [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
+; NO-EVL-NEXT:    ret float [[ADD]]
+; NO-EVL:       for.body:
+; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; NO-EVL-NEXT:    [[S_011:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
+; NO-EVL-NEXT:    [[T_010:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; NO-EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; NO-EVL:       if.then:
+; NO-EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; NO-EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP3:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
+; NO-EVL-NEXT:    br label [[FOR_INC]]
+; NO-EVL:       for.inc:
+; NO-EVL-NEXT:    [[T_1]] = phi float [ [[TMP2]], [[IF_THEN]] ], [ [[T_010]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[S_1]] = phi float [ [[TMP3]], [[IF_THEN]] ], [ [[S_011]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @csa_same_cond_float(
+; DATA-NEXT:  entry:
+; DATA-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA:       for.body.preheader:
+; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT:    br label [[FOR_BODY:%.*]]
+; DATA:       for.cond.cleanup.loopexit:
+; DATA-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; DATA-NEXT:    [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_INC]] ]
+; DATA-NEXT:    [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
+; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
+; DATA:       for.cond.cleanup:
+; DATA-NEXT:    [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
+; DATA-NEXT:    ret float [[ADD]]
+; DATA:       for.body:
+; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; DATA-NEXT:    [[S_011:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
+; DATA-NEXT:    [[T_010:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; DATA-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; DATA-NEXT:    br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; DATA:       if.then:
+; DATA-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; DATA-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP3:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
+; DATA-NEXT:    br label [[FOR_INC]]
+; DATA:       for.inc:
+; DATA-NEXT:    [[T_1]] = phi float [ [[TMP2]], [[IF_THEN]] ], [ [[T_010]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[S_1]] = phi float [ [[TMP3]], [[IF_THEN]] ], [ [[S_011]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+  %cmp9 = icmp sgt i32 %N, 0
+  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.inc
+  %0 = fadd float %t.1, %s.1
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %add = phi float [ %0, %for.cond.cleanup.loopexit ], [ 2.000000e+00, %entry ]
+  ret float %add
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %s.011 = phi float [ 1.000000e+00, %for.body.preheader ], [ %s.1, %for.inc ]
+  %t.010 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
+  %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %indvars.iv
+  %1 = load i8, ptr %arrayidx, align 1
+  %tobool.not = icmp eq i8 %1, 0
+  br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
+  %2 = load float, ptr %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
+  %3 = load float, ptr %arrayidx4, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %t.1 = phi float [ %2, %if.then ], [ %t.010, %for.body ]
+  %s.1 = phi float [ %3, %if.then ], [ %s.011, %for.body ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int csa_else_if_same_scalar_int(int N, bool *cond0, bool *cond1, int *data0,
+;                                 int *data1) {
+;   int t = -1;
+;   for (int i = 0; i < N; i++) {
+;     if (cond0[i])
+;       t = data0[i];
+;     else if (cond1[i])
+;       t = data1[i];
+;   }
+;   return t; // use t
+; }
+define i32 @csa_else_if_same_scalar_int(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_else_if_same_scalar_int(
+; EVL-NEXT:  entry:
+; EVL-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL:       for.body.preheader:
+; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; EVL:       for.cond.cleanup.loopexit:
+; EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; EVL:       for.cond.cleanup:
+; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT:    ret i32 [[T_0_LCSSA]]
+; EVL:       for.body:
+; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; EVL-NEXT:    [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[FOR_INC_SINK_SPLIT:%.*]]
+; EVL:       if.else:
+; EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; EVL-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; EVL-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[FOR_INC_SINK_SPLIT]]
+; EVL:       for.inc.sink.split:
+; EVL-NEXT:    [[DATA0_SINK:%.*]] = phi ptr [ [[DATA0:%.*]], [[FOR_BODY]] ], [ [[DATA1:%.*]], [[IF_ELSE]] ]
+; EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0_SINK]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; EVL-NEXT:    br label [[FOR_INC]]
+; EVL:       for.inc:
+; EVL-NEXT:    [[T_1]] = phi i32 [ [[T_016]], [[IF_ELSE]] ], [ [[TMP2]], [[FOR_INC_SINK_SPLIT]] ]
+; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @csa_else_if_same_scalar_int(
+; NO-EVL-NEXT:  entry:
+; NO-EVL-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL:       for.body.preheader:
+; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-EVL:       for.cond.cleanup.loopexit:
+; NO-EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; NO-EVL:       for.cond.cleanup:
+; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT:    ret i32 [[T_0_LCSSA]]
+; NO-EVL:       for.body:
+; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; NO-EVL-NEXT:    [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; NO-EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[FOR_INC_SINK_SPLIT:%.*]]
+; NO-EVL:       if.else:
+; NO-EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; NO-EVL-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; NO-EVL-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[FOR_INC_SINK_SPLIT]]
+; NO-EVL:       for.inc.sink.split:
+; NO-EVL-NEXT:    [[DATA0_SINK:%.*]] = phi ptr [ [[DATA0:%.*]], [[FOR_BODY]] ], [ [[DATA1:%.*]], [[IF_ELSE]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0_SINK]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; NO-EVL-NEXT:    br label [[FOR_INC]]
+; NO-EVL:       for.inc:
+; NO-EVL-NEXT:    [[T_1]] = phi i32 [ [[T_016]], [[IF_ELSE]] ], [ [[TMP2]], [[FOR_INC_SINK_SPLIT]] ]
+; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @csa_else_if_same_scalar_int(
+; DATA-NEXT:  entry:
+; DATA-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA:       for.body.preheader:
+; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT:    br label [[FOR_BODY:%.*]]
+; DATA:       for.cond.cleanup.loopexit:
+; DATA-NEXT:    [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
+; DATA:       for.cond.cleanup:
+; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; DATA-NEXT:    ret i32 [[T_0_LCSSA]]
+; DATA:       for.body:
+; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; DATA-NEXT:    [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; DATA-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; DATA-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[FOR_INC_SINK_SPLIT:%.*]]
+; DATA:       if.else:
+; DATA-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; DATA-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; DATA-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[FOR_INC_SINK_SPLIT]]
+; DATA:       for.inc.sink.split:
+; DATA-NEXT:    [[DATA0_SINK:%.*]] = phi ptr [ [[DATA0:%.*]], [[FOR_BODY]] ], [ [[DATA1:%.*]], [[IF_ELSE]] ]
+; DATA-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0_SINK]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; DATA-NEXT:    br label [[FOR_INC]]
+; DATA:       for.inc:
+; DATA-NEXT:    [[T_1]] = phi i32 [ [[T_016]], [[IF_ELSE]] ], [ [[TMP2]], [[FOR_INC_SINK_SPLIT]] ]
+; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+  %cmp15 = icmp sgt i32 %N, 0
+  br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.inc, %entry
+  %t.0.lcssa = phi i32 [ -1, %entry ], [ %t.1, %for.inc ]
+  ret i32 %t.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %t.016 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
+  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %tobool.not = icmp eq i8 %0, 0
+  br i1 %tobool.not, label %if.else, label %for.inc.sink.split
+
+if.else:                                          ; preds = %for.body
+  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+  %1 = load i8, ptr %arrayidx4, align 1
+  %tobool5.not = icmp eq i8 %1, 0
+  br i1 %tobool5.not, label %for.inc, label %for.inc.sink.split
+
+for.inc.sink.split:                               ; preds = %if.else, %for.body
+  %data0.sink = phi ptr [ %data0, %for.body ], [ %data1, %if.else ]
+  %arrayidx2 = getelementptr inbounds i32, ptr %data0.sink, i64 %indvars.iv
+  %2 = load i32, ptr %arrayidx2, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.inc.sink.split, %if.else
+  %t.1 = phi i32 [ %t.016, %if.else ], [ %2, %for.inc.sink.split ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; float csa_else_if_same_scalar_float(int N, bool *cond0, bool *cond1,
+;                                     float *data0, float *data1) {
+;   float t = 1.0f;
+;   for (int i = 0; i < N; i++) {
+;     if (cond0[i])
+;       t = data0[i];
+;     else if (cond1[i])
+;       t = data1[i];
+;   }
+;   return t; // use t
+; }
+define float @csa_else_if_same_scalar_float(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_else_if_same_scalar_float(
+; EVL-NEXT:  entry:
+; EVL-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL:       for.body.preheader:
+; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; EVL:       for.cond.cleanup.loopexit:
+; EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; EVL:       for.cond.cleanup:
+; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT:    ret float [[T_0_LCSSA]]
+; EVL:       for.body:
+; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; EVL-NEXT:    [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[FOR_INC_SINK_SPLIT:%.*]]
+; EVL:       if.else:
+; EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; EVL-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; EVL-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[FOR_INC_SINK_SPLIT]]
+; EVL:       for.inc.sink.split:
+; EVL-NEXT:    [[DATA0_SINK:%.*]] = phi ptr [ [[DATA0:%.*]], [[FOR_BODY]] ], [ [[DATA1:%.*]], [[IF_ELSE]] ]
+; EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0_SINK]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; EVL-NEXT:    br label [[FOR_INC]]
+; EVL:       for.inc:
+; EVL-NEXT:    [[T_1]] = phi float [ [[T_016]], [[IF_ELSE]] ], [ [[TMP2]], [[FOR_INC_SINK_SPLIT]] ]
+; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @csa_else_if_same_scalar_float(
+; NO-EVL-NEXT:  entry:
+; NO-EVL-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL:       for.body.preheader:
+; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-EVL:       for.cond.cleanup.loopexit:
+; NO-EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; NO-EVL:       for.cond.cleanup:
+; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT:    ret float [[T_0_LCSSA]]
+; NO-EVL:       for.body:
+; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; NO-EVL-NEXT:    [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; NO-EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[FOR_INC_SINK_SPLIT:%.*]]
+; NO-EVL:       if.else:
+; NO-EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; NO-EVL-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; NO-EVL-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[FOR_INC_SINK_SPLIT]]
+; NO-EVL:       for.inc.sink.split:
+; NO-EVL-NEXT:    [[DATA0_SINK:%.*]] = phi ptr [ [[DATA0:%.*]], [[FOR_BODY]] ], [ [[DATA1:%.*]], [[IF_ELSE]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0_SINK]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; NO-EVL-NEXT:    br label [[FOR_INC]]
+; NO-EVL:       for.inc:
+; NO-EVL-NEXT:    [[T_1]] = phi float [ [[T_016]], [[IF_ELSE]] ], [ [[TMP2]], [[FOR_INC_SINK_SPLIT]] ]
+; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @csa_else_if_same_scalar_float(
+; DATA-NEXT:  entry:
+; DATA-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA:       for.body.preheader:
+; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT:    br label [[FOR_BODY:%.*]]
+; DATA:       for.cond.cleanup.loopexit:
+; DATA-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
+; DATA:       for.cond.cleanup:
+; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; DATA-NEXT:    ret float [[T_0_LCSSA]]
+; DATA:       for.body:
+; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; DATA-NEXT:    [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; DATA-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; DATA-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[FOR_INC_SINK_SPLIT:%.*]]
+; DATA:       if.else:
+; DATA-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; DATA-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; DATA-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[FOR_INC_SINK_SPLIT]]
+; DATA:       for.inc.sink.split:
+; DATA-NEXT:    [[DATA0_SINK:%.*]] = phi ptr [ [[DATA0:%.*]], [[FOR_BODY]] ], [ [[DATA1:%.*]], [[IF_ELSE]] ]
+; DATA-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0_SINK]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; DATA-NEXT:    br label [[FOR_INC]]
+; DATA:       for.inc:
+; DATA-NEXT:    [[T_1]] = phi float [ [[T_016]], [[IF_ELSE]] ], [ [[TMP2]], [[FOR_INC_SINK_SPLIT]] ]
+; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+  %cmp15 = icmp sgt i32 %N, 0
+  br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.inc, %entry
+  %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.1, %for.inc ]
+  ret float %t.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %t.016 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
+  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %tobool.not = icmp eq i8 %0, 0
+  br i1 %tobool.not, label %if.else, label %for.inc.sink.split
+
+if.else:                                          ; preds = %for.body
+  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+  %1 = load i8, ptr %arrayidx4, align 1
+  %tobool5.not = icmp eq i8 %1, 0
+  br i1 %tobool5.not, label %for.inc, label %for.inc.sink.split
+
+for.inc.sink.split:                               ; preds = %if.else, %for.body
+  %data0.sink = phi ptr [ %data0, %for.body ], [ %data1, %if.else ]
+  %arrayidx2 = getelementptr inbounds float, ptr %data0.sink, i64 %indvars.iv
+  %2 = load float, ptr %arrayidx2, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.inc.sink.split, %if.else
+  %t.1 = phi float [ %t.016, %if.else ], [ %2, %for.inc.sink.split ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int csa_else_if_int(int N, bool *cond0, bool *cond1, int *data0, int *data1) {
+;   int t = -1;
+;   int s = -1;
+;   for (int i = 0; i < N; i++) {
+;     if (cond0[i])
+;       t = data0[i];
+;     else if (cond1[i])
+;       s = data1[i];
+;   }
+;   return t | s; // use t and s
+; }
+define i32 @csa_else_if_int(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_else_if_int(
+; EVL-NEXT:  entry:
+; EVL-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL:       for.body.preheader:
+; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; EVL:       for.cond.cleanup.loopexit:
+; EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_INC]] ]
+; EVL-NEXT:    [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[T_1_LCSSA]]
+; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; EVL:       for.cond.cleanup:
+; EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
+; EVL-NEXT:    ret i32 [[OR]]
+; EVL:       for.body:
+; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; EVL-NEXT:    [[S_017:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
+; EVL-NEXT:    [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
+; EVL:       if.then:
+; EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; EVL-NEXT:    br label [[FOR_INC]]
+; EVL:       if.else:
+; EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; EVL-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
+; EVL-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
+; EVL:       if.then6:
+; EVL-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4
+; EVL-NEXT:    br label [[FOR_INC]]
+; EVL:       for.inc:
+; EVL-NEXT:    [[T_1]] = phi i32 [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[IF_THEN6]] ], [ [[T_016]], [[IF_ELSE]] ]
+; EVL-NEXT:    [[S_1]] = phi i32 [ [[S_017]], [[IF_THEN]] ], [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_ELSE]] ]
+; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @csa_else_if_int(
+; NO-EVL-NEXT:  entry:
+; NO-EVL-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL:       for.body.preheader:
+; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-EVL:       for.cond.cleanup.loopexit:
+; NO-EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; NO-EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_INC]] ]
+; NO-EVL-NEXT:    [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[T_1_LCSSA]]
+; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; NO-EVL:       for.cond.cleanup:
+; NO-EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
+; NO-EVL-NEXT:    ret i32 [[OR]]
+; NO-EVL:       for.body:
+; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; NO-EVL-NEXT:    [[S_017:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
+; NO-EVL-NEXT:    [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; NO-EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
+; NO-EVL:       if.then:
+; NO-EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; NO-EVL-NEXT:    br label [[FOR_INC]]
+; NO-EVL:       if.else:
+; NO-EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; NO-EVL-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
+; NO-EVL-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
+; NO-EVL:       if.then6:
+; NO-EVL-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4
+; NO-EVL-NEXT:    br label [[FOR_INC]]
+; NO-EVL:       for.inc:
+; NO-EVL-NEXT:    [[T_1]] = phi i32 [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[IF_THEN6]] ], [ [[T_016]], [[IF_ELSE]] ]
+; NO-EVL-NEXT:    [[S_1]] = phi i32 [ [[S_017]], [[IF_THEN]] ], [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_ELSE]] ]
+; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @csa_else_if_int(
+; DATA-NEXT:  entry:
+; DATA-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA:       for.body.preheader:
+; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT:    br label [[FOR_BODY:%.*]]
+; DATA:       for.cond.cleanup.loopexit:
+; DATA-NEXT:    [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; DATA-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_INC]] ]
+; DATA-NEXT:    [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[T_1_LCSSA]]
+; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
+; DATA:       for.cond.cleanup:
+; DATA-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
+; DATA-NEXT:    ret i32 [[OR]]
+; DATA:       for.body:
+; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; DATA-NEXT:    [[S_017:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
+; DATA-NEXT:    [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; DATA-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; DATA-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
+; DATA:       if.then:
+; DATA-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; DATA-NEXT:    br label [[FOR_INC]]
+; DATA:       if.else:
+; DATA-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; DATA-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
+; DATA-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
+; DATA:       if.then6:
+; DATA-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4
+; DATA-NEXT:    br label [[FOR_INC]]
+; DATA:       for.inc:
+; DATA-NEXT:    [[T_1]] = phi i32 [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[IF_THEN6]] ], [ [[T_016]], [[IF_ELSE]] ]
+; DATA-NEXT:    [[S_1]] = phi i32 [ [[S_017]], [[IF_THEN]] ], [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_ELSE]] ]
+; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+  %cmp15 = icmp sgt i32 %N, 0
+  br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.inc
+  %0 = or i32 %s.1, %t.1
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %or = phi i32 [ %0, %for.cond.cleanup.loopexit ], [ -1, %entry ]
+  ret i32 %or
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %s.017 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.inc ]
+  %t.016 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
+  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+  %1 = load i8, ptr %arrayidx, align 1
+  %tobool.not = icmp eq i8 %1, 0
+  br i1 %tobool.not, label %if.else, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
+  %2 = load i32, ptr %arrayidx2, align 4
+  br label %for.inc
+
+if.else:                                          ; preds = %for.body
+  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+  %3 = load i8, ptr %arrayidx4, align 1
+  %tobool5.not = icmp eq i8 %3, 0
+  br i1 %tobool5.not, label %for.inc, label %if.then6
+
+if.then6:                                         ; preds = %if.else
+  %arrayidx8 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
+  %4 = load i32, ptr %arrayidx8, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.then, %if.then6, %if.else
+  %t.1 = phi i32 [ %2, %if.then ], [ %t.016, %if.then6 ], [ %t.016, %if.else ]
+  %s.1 = phi i32 [ %s.017, %if.then ], [ %4, %if.then6 ], [ %s.017, %if.else ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; float csa_else_if_float(int N, bool *cond0, bool *cond1, float *data0,
+;                         float *data1) {
+;   float t = 1.0f;
+;   float s = 1.0f;
+;   for (int i = 0; i < N; i++) {
+;     if (cond0[i])
+;       t = data0[i];
+;     else if (cond1[i])
+;       s = data1[i];
+;   }
+;   return t + s; // use t and s
+; }
+define float @csa_else_if_float(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_else_if_float(
+; EVL-NEXT:  entry:
+; EVL-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL:       for.body.preheader:
+; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; EVL:       for.cond.cleanup.loopexit:
+; EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_INC]] ]
+; EVL-NEXT:    [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
+; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; EVL:       for.cond.cleanup:
+; EVL-NEXT:    [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
+; EVL-NEXT:    ret float [[ADD]]
+; EVL:       for.body:
+; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; EVL-NEXT:    [[S_017:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
+; EVL-NEXT:    [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
+; EVL:       if.then:
+; EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; EVL-NEXT:    br label [[FOR_INC]]
+; EVL:       if.else:
+; EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; EVL-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
+; EVL-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
+; EVL:       if.then6:
+; EVL-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP4:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
+; EVL-NEXT:    br label [[FOR_INC]]
+; EVL:       for.inc:
+; EVL-NEXT:    [[T_1]] = phi float [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[IF_THEN6]] ], [ [[T_016]], [[IF_ELSE]] ]
+; EVL-NEXT:    [[S_1]] = phi float [ [[S_017]], [[IF_THEN]] ], [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_ELSE]] ]
+; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @csa_else_if_float(
+; NO-EVL-NEXT:  entry:
+; NO-EVL-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL:       for.body.preheader:
+; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-EVL:       for.cond.cleanup.loopexit:
+; NO-EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; NO-EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_INC]] ]
+; NO-EVL-NEXT:    [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
+; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; NO-EVL:       for.cond.cleanup:
+; NO-EVL-NEXT:    [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
+; NO-EVL-NEXT:    ret float [[ADD]]
+; NO-EVL:       for.body:
+; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; NO-EVL-NEXT:    [[S_017:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
+; NO-EVL-NEXT:    [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; NO-EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
+; NO-EVL:       if.then:
+; NO-EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; NO-EVL-NEXT:    br label [[FOR_INC]]
+; NO-EVL:       if.else:
+; NO-EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; NO-EVL-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
+; NO-EVL-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
+; NO-EVL:       if.then6:
+; NO-EVL-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP4:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
+; NO-EVL-NEXT:    br label [[FOR_INC]]
+; NO-EVL:       for.inc:
+; NO-EVL-NEXT:    [[T_1]] = phi float [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[IF_THEN6]] ], [ [[T_016]], [[IF_ELSE]] ]
+; NO-EVL-NEXT:    [[S_1]] = phi float [ [[S_017]], [[IF_THEN]] ], [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_ELSE]] ]
+; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @csa_else_if_float(
+; DATA-NEXT:  entry:
+; DATA-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA:       for.body.preheader:
+; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT:    br label [[FOR_BODY:%.*]]
+; DATA:       for.cond.cleanup.loopexit:
+; DATA-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; DATA-NEXT:    [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_INC]] ]
+; DATA-NEXT:    [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
+; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
+; DATA:       for.cond.cleanup:
+; DATA-NEXT:    [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
+; DATA-NEXT:    ret float [[ADD]]
+; DATA:       for.body:
+; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; DATA-NEXT:    [[S_017:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
+; DATA-NEXT:    [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; DATA-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; DATA-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
+; DATA:       if.then:
+; DATA-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; DATA-NEXT:    br label [[FOR_INC]]
+; DATA:       if.else:
+; DATA-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; DATA-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
+; DATA-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
+; DATA:       if.then6:
+; DATA-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP4:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
+; DATA-NEXT:    br label [[FOR_INC]]
+; DATA:       for.inc:
+; DATA-NEXT:    [[T_1]] = phi float [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[IF_THEN6]] ], [ [[T_016]], [[IF_ELSE]] ]
+; DATA-NEXT:    [[S_1]] = phi float [ [[S_017]], [[IF_THEN]] ], [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_ELSE]] ]
+; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+  %cmp15 = icmp sgt i32 %N, 0
+  br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.inc
+  %0 = fadd float %t.1, %s.1
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %add = phi float [ %0, %for.cond.cleanup.loopexit ], [ 2.000000e+00, %entry ]
+  ret float %add
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %s.017 = phi float [ 1.000000e+00, %for.body.preheader ], [ %s.1, %for.inc ]
+  %t.016 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
+  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+  %1 = load i8, ptr %arrayidx, align 1
+  %tobool.not = icmp eq i8 %1, 0
+  br i1 %tobool.not, label %if.else, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
+  %2 = load float, ptr %arrayidx2, align 4
+  br label %for.inc
+
+if.else:                                          ; preds = %for.body
+  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+  %3 = load i8, ptr %arrayidx4, align 1
+  %tobool5.not = icmp eq i8 %3, 0
+  br i1 %tobool5.not, label %for.inc, label %if.then6
+
+if.then6:                                         ; preds = %if.else
+  %arrayidx8 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
+  %4 = load float, ptr %arrayidx8, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.then, %if.then6, %if.else
+  %t.1 = phi float [ %2, %if.then ], [ %t.016, %if.then6 ], [ %t.016, %if.else ]
+  %s.1 = phi float [ %s.017, %if.then ], [ %4, %if.then6 ], [ %s.017, %if.else ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; uint64_t idx_scalar(int64_t *a, int64_t *b, uint64_t ii, uint64_t n) {
+;   uint64_t idx = ii;
+;   for (uint64_t i = 0; i < n; ++i)
+;     idx = (a[i] > b[i]) ? i : idx;
+;   return idx;
+; }
+define i64 @idx_scalar(ptr %a, ptr %b, i64 %ii, i64 %n) {
+; EVL-LABEL: @idx_scalar(
+; EVL-NEXT:  entry:
+; EVL-NEXT:    [[CMP8_NOT:%.*]] = icmp eq i64 [[N:%.*]], 0
+; EVL-NEXT:    br i1 [[CMP8_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; EVL:       for.body.preheader:
+; EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; EVL:       for.cond.cleanup.loopexit:
+; EVL-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; EVL:       for.cond.cleanup:
+; EVL-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II:%.*]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT:    ret i64 [[IDX_0_LCSSA]]
+; EVL:       for.body:
+; EVL-NEXT:    [[I_010:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; EVL-NEXT:    [[IDX_09:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[FOR_BODY_PREHEADER]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[I_010]]
+; EVL-NEXT:    [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; EVL-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[I_010]]
+; EVL-NEXT:    [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; EVL-NEXT:    [[CMP2:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]]
+; EVL-NEXT:    [[COND]] = select i1 [[CMP2]], i64 [[I_010]], i64 [[IDX_09]]
+; EVL-NEXT:    [[INC]] = add nuw i64 [[I_010]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @idx_scalar(
+; NO-EVL-NEXT:  entry:
+; NO-EVL-NEXT:    [[CMP8_NOT:%.*]] = icmp eq i64 [[N:%.*]], 0
+; NO-EVL-NEXT:    br i1 [[CMP8_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; NO-EVL:       for.body.preheader:
+; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-EVL:       for.cond.cleanup.loopexit:
+; NO-EVL-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; NO-EVL:       for.cond.cleanup:
+; NO-EVL-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II:%.*]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT:    ret i64 [[IDX_0_LCSSA]]
+; NO-EVL:       for.body:
+; NO-EVL-NEXT:    [[I_010:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; NO-EVL-NEXT:    [[IDX_09:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[FOR_BODY_PREHEADER]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[I_010]]
+; NO-EVL-NEXT:    [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; NO-EVL-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[I_010]]
+; NO-EVL-NEXT:    [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; NO-EVL-NEXT:    [[CMP2:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]]
+; NO-EVL-NEXT:    [[COND]] = select i1 [[CMP2]], i64 [[I_010]], i64 [[IDX_09]]
+; NO-EVL-NEXT:    [[INC]] = add nuw i64 [[I_010]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @idx_scalar(
+; DATA-NEXT:  entry:
+; DATA-NEXT:    [[CMP8_NOT:%.*]] = icmp eq i64 [[N:%.*]], 0
+; DATA-NEXT:    br i1 [[CMP8_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; DATA:       for.body.preheader:
+; DATA-NEXT:    br label [[FOR_BODY:%.*]]
+; DATA:       for.cond.cleanup.loopexit:
+; DATA-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
+; DATA:       for.cond.cleanup:
+; DATA-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II:%.*]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; DATA-NEXT:    ret i64 [[IDX_0_LCSSA]]
+; DATA:       for.body:
+; DATA-NEXT:    [[I_010:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; DATA-NEXT:    [[IDX_09:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[FOR_BODY_PREHEADER]] ]
+; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[I_010]]
+; DATA-NEXT:    [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; DATA-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[I_010]]
+; DATA-NEXT:    [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; DATA-NEXT:    [[CMP2:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]]
+; DATA-NEXT:    [[COND]] = select i1 [[CMP2]], i64 [[I_010]], i64 [[IDX_09]]
+; DATA-NEXT:    [[INC]] = add nuw i64 [[I_010]], 1
+; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
+; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+  %cmp8.not = icmp eq i64 %n, 0
+  br i1 %cmp8.not, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %cond.lcssa = phi i64 [ %cond, %for.body ]
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %idx.0.lcssa = phi i64 [ %ii, %entry ], [ %cond.lcssa, %for.cond.cleanup.loopexit ]
+  ret i64 %idx.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.010 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %idx.09 = phi i64 [ %cond, %for.body ], [ %ii, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i64, ptr %a, i64 %i.010
+  %0 = load i64, ptr %arrayidx, align 8
+  %arrayidx1 = getelementptr inbounds i64, ptr %b, i64 %i.010
+  %1 = load i64, ptr %arrayidx1, align 8
+  %cmp2 = icmp sgt i64 %0, %1
+  %cond = select i1 %cmp2, i64 %i.010, i64 %idx.09
+  %inc = add nuw i64 %i.010, 1
+  %exitcond.not = icmp eq i64 %inc, %n
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; uint64_t idx_scalar_dec(int64_t *a, int64_t *b, uint64_t ii, uint64_t n) {
+;   uint64_t idx = ii;
+;   for (uint64_t i = n; i > 0; --i) // decreasing
+;      idx = (a[i - 1] > b[i - 1]) ? i : idx;
+;   return idx;
+; }
+define dso_local i64 @idx_scalar_dec(ptr %a, ptr %b, i64 %ii, i64 %n) {
+; EVL-LABEL: @idx_scalar_dec(
+; EVL-NEXT:  entry:
+; EVL-NEXT:    [[CMP_NOT9:%.*]] = icmp eq i64 [[N:%.*]], 0
+; EVL-NEXT:    br i1 [[CMP_NOT9]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; EVL:       for.body.preheader:
+; EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; EVL:       for.cond.cleanup.loopexit:
+; EVL-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; EVL:       for.cond.cleanup:
+; EVL-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II:%.*]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT:    ret i64 [[IDX_0_LCSSA]]
+; EVL:       for.body:
+; EVL-NEXT:    [[I_011:%.*]] = phi i64 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ]
+; EVL-NEXT:    [[IDX_010:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[FOR_BODY_PREHEADER]] ]
+; EVL-NEXT:    [[SUB]] = add i64 [[I_011]], -1
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[SUB]]
+; EVL-NEXT:    [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[SUB]]
+; EVL-NEXT:    [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
+; EVL-NEXT:    [[CMP3:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]]
+; EVL-NEXT:    [[COND]] = select i1 [[CMP3]], i64 [[I_011]], i64 [[IDX_010]]
+; EVL-NEXT:    [[CMP_NOT:%.*]] = icmp eq i64 [[SUB]], 0
+; EVL-NEXT:    br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @idx_scalar_dec(
+; NO-EVL-NEXT:  entry:
+; NO-EVL-NEXT:    [[CMP_NOT9:%.*]] = icmp eq i64 [[N:%.*]], 0
+; NO-EVL-NEXT:    br i1 [[CMP_NOT9]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; NO-EVL:       for.body.preheader:
+; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-EVL:       for.cond.cleanup.loopexit:
+; NO-EVL-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; NO-EVL:       for.cond.cleanup:
+; NO-EVL-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II:%.*]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT:    ret i64 [[IDX_0_LCSSA]]
+; NO-EVL:       for.body:
+; NO-EVL-NEXT:    [[I_011:%.*]] = phi i64 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ]
+; NO-EVL-NEXT:    [[IDX_010:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[FOR_BODY_PREHEADER]] ]
+; NO-EVL-NEXT:    [[SUB]] = add i64 [[I_011]], -1
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[SUB]]
+; NO-EVL-NEXT:    [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; NO-EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[SUB]]
+; NO-EVL-NEXT:    [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
+; NO-EVL-NEXT:    [[CMP3:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]]
+; NO-EVL-NEXT:    [[COND]] = select i1 [[CMP3]], i64 [[I_011]], i64 [[IDX_010]]
+; NO-EVL-NEXT:    [[CMP_NOT:%.*]] = icmp eq i64 [[SUB]], 0
+; NO-EVL-NEXT:    br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @idx_scalar_dec(
+; DATA-NEXT:  entry:
+; DATA-NEXT:    [[CMP_NOT9:%.*]] = icmp eq i64 [[N:%.*]], 0
+; DATA-NEXT:    br i1 [[CMP_NOT9]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; DATA:       for.body.preheader:
+; DATA-NEXT:    br label [[FOR_BODY:%.*]]
+; DATA:       for.cond.cleanup.loopexit:
+; DATA-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
+; DATA:       for.cond.cleanup:
+; DATA-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II:%.*]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; DATA-NEXT:    ret i64 [[IDX_0_LCSSA]]
+; DATA:       for.body:
+; DATA-NEXT:    [[I_011:%.*]] = phi i64 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ]
+; DATA-NEXT:    [[IDX_010:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[FOR_BODY_PREHEADER]] ]
+; DATA-NEXT:    [[SUB]] = add i64 [[I_011]], -1
+; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[SUB]]
+; DATA-NEXT:    [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; DATA-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[SUB]]
+; DATA-NEXT:    [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
+; DATA-NEXT:    [[CMP3:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]]
+; DATA-NEXT:    [[COND]] = select i1 [[CMP3]], i64 [[I_011]], i64 [[IDX_010]]
+; DATA-NEXT:    [[CMP_NOT:%.*]] = icmp eq i64 [[SUB]], 0
+; DATA-NEXT:    br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+  %cmp.not9 = icmp eq i64 %n, 0
+  br i1 %cmp.not9, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %cond.lcssa = phi i64 [ %cond, %for.body ]
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %idx.0.lcssa = phi i64 [ %ii, %entry ], [ %cond.lcssa, %for.cond.cleanup.loopexit ]
+  ret i64 %idx.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.011 = phi i64 [ %sub, %for.body ], [ %n, %for.body.preheader ]
+  %idx.010 = phi i64 [ %cond, %for.body ], [ %ii, %for.body.preheader ]
+  %sub = add i64 %i.011, -1
+  %arrayidx = getelementptr inbounds i64, ptr %a, i64 %sub
+  %0 = load i64, ptr %arrayidx, align 8
+  %arrayidx2 = getelementptr inbounds i64, ptr %b, i64 %sub
+  %1 = load i64, ptr %arrayidx2, align 8
+  %cmp3 = icmp sgt i64 %0, %1
+  %cond = select i1 %cmp3, i64 %i.011, i64 %idx.010
+  %cmp.not = icmp eq i64 %sub, 0
+  br i1 %cmp.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; The key part of this function is that the true arm of the select corresponds
+; to selecting the initial value, instead of selecting the new value.
+; int simple_csa_int_select_neg_cond(int N, int *data) {
+;   int t = 0;
+;   for (int i = 0; i < N; i++) {
+;     if (i != data[i])
+;       t = data[i];
+;   }
+;   return t; // use t
+; }
+define i32 @simple_csa_int_select_neg_cond(i32 %N, ptr %data) {
+; EVL-LABEL: @simple_csa_int_select_neg_cond(
+; EVL-NEXT:  entry:
+; EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL:       for.body.preheader:
+; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; EVL:       for.cond.cleanup.loopexit:
+; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; EVL:       for.cond.cleanup:
+; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT:    ret i32 [[T_0_LCSSA]]
+; EVL:       for.body:
+; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[T_010:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; EVL-NEXT:    [[CMP1_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], [[TMP1]]
+; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP0]]
+; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @simple_csa_int_select_neg_cond(
+; NO-EVL-NEXT:  entry:
+; NO-EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL:       for.body.preheader:
+; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-EVL:       for.cond.cleanup.loopexit:
+; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; NO-EVL:       for.cond.cleanup:
+; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT:    ret i32 [[T_0_LCSSA]]
+; NO-EVL:       for.body:
+; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[T_010:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; NO-EVL-NEXT:    [[CMP1_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], [[TMP1]]
+; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP0]]
+; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @simple_csa_int_select_neg_cond(
+; DATA-NEXT:  entry:
+; DATA-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA:       for.body.preheader:
+; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT:    br label [[FOR_BODY:%.*]]
+; DATA:       for.cond.cleanup.loopexit:
+; DATA-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
+; DATA:       for.cond.cleanup:
+; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; DATA-NEXT:    ret i32 [[T_0_LCSSA]]
+; DATA:       for.body:
+; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[T_010:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; DATA-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; DATA-NEXT:    [[CMP1_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], [[TMP1]]
+; DATA-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP0]]
+; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+  %cmp9 = icmp sgt i32 %N, 0
+  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %spec.select.lcssa = phi i32 [ %spec.select, %for.body ]
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %t.0.lcssa = phi i32 [ 0, %entry ], [ %spec.select.lcssa, %for.cond.cleanup.loopexit ]
+  ret i32 %t.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %t.010 = phi i32 [ 0, %for.body.preheader ], [ %spec.select, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %data, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %1 = zext i32 %0 to i64
+  %cmp1.not = icmp eq i64 %indvars.iv, %1
+  %spec.select = select i1 %cmp1.not, i32 %t.010, i32 %0
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int *simple_csa_ptr_select(int N, int **data) {
+;   int *t = nullptr;
+;   for (int i = 0; i < N; i++) {
+;     if (i < *data[i])
+;       t = data[i];
+;   }
+;   return t; // use t
+; }
+define ptr @simple_csa_ptr_select(i32 %N, ptr %data) {
+; EVL-LABEL: @simple_csa_ptr_select(
+; EVL-NEXT:  entry:
+; EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL:       for.body.preheader:
+; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; EVL:       for.cond.cleanup.loopexit:
+; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; EVL:       for.cond.cleanup:
+; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT:    ret ptr [[T_0_LCSSA]]
+; EVL:       for.body:
+; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[T_010:%.*]] = phi ptr [ null, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
+; EVL-NEXT:    [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+; EVL-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
+; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP2]]
+; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP0]], ptr [[T_010]]
+; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @simple_csa_ptr_select(
+; NO-EVL-NEXT:  entry:
+; NO-EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL:       for.body.preheader:
+; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-EVL:       for.cond.cleanup.loopexit:
+; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; NO-EVL:       for.cond.cleanup:
+; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT:    ret ptr [[T_0_LCSSA]]
+; NO-EVL:       for.body:
+; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[T_010:%.*]] = phi ptr [ null, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
+; NO-EVL-NEXT:    [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+; NO-EVL-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
+; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP2]]
+; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP0]], ptr [[T_010]]
+; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @simple_csa_ptr_select(
+; DATA-NEXT:  entry:
+; DATA-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA:       for.body.preheader:
+; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT:    br label [[FOR_BODY:%.*]]
+; DATA:       for.cond.cleanup.loopexit:
+; DATA-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
+; DATA:       for.cond.cleanup:
+; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; DATA-NEXT:    ret ptr [[T_0_LCSSA]]
+; DATA:       for.body:
+; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[T_010:%.*]] = phi ptr [ null, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
+; DATA-NEXT:    [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+; DATA-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
+; DATA-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP2]]
+; DATA-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP0]], ptr [[T_010]]
+; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+  %cmp9 = icmp sgt i32 %N, 0
+  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  %spec.select.lcssa = phi ptr [ %spec.select, %for.body ]
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  %t.0.lcssa = phi ptr [ null, %entry ], [ %spec.select.lcssa, %for.cond.cleanup.loopexit ]
+  ret ptr %t.0.lcssa
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %t.010 = phi ptr [ null, %for.body.preheader ], [ %spec.select, %for.body ]
+  %arrayidx = getelementptr inbounds ptr, ptr %data, i64 %indvars.iv
+  %0 = load ptr, ptr %arrayidx, align 8
+  %1 = load i32, ptr %0, align 4
+  %2 = sext i32 %1 to i64
+  %cmp1 = icmp slt i64 %indvars.iv, %2
+  %spec.select = select i1 %cmp1, ptr %0, ptr %t.010
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}

>From 6b3d5ef7f3f92e94ee32a0709da08e909c1e3f57 Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Thu, 8 Aug 2024 10:31:27 -0700
Subject: [PATCH 02/23] [CSA] Add CSADescriptors Analysis

---
 llvm/include/llvm/Analysis/CSADescriptors.h | 78 +++++++++++++++++++++
 llvm/lib/Analysis/CMakeLists.txt            |  1 +
 llvm/lib/Analysis/CSADescriptors.cpp        | 73 +++++++++++++++++++
 3 files changed, 152 insertions(+)
 create mode 100644 llvm/include/llvm/Analysis/CSADescriptors.h
 create mode 100644 llvm/lib/Analysis/CSADescriptors.cpp

diff --git a/llvm/include/llvm/Analysis/CSADescriptors.h b/llvm/include/llvm/Analysis/CSADescriptors.h
new file mode 100644
index 00000000000000..edd98777d84ab6
--- /dev/null
+++ b/llvm/include/llvm/Analysis/CSADescriptors.h
@@ -0,0 +1,78 @@
+//===- llvm/Analysis/CSADescriptors.h - CSA Descriptors --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file "describes" conditional scalar assignments (CSA).
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Value.h"
+
+#ifndef LLVM_ANALYSIS_CSADESCRIPTORS_H
+#define LLVM_ANALYSIS_CSADESCRIPTORS_H
+
+namespace llvm {
+
+/// A Conditional Scalar Assignment (CSA) is an assignment from an initial
+/// scalar that may or may not occur.
+class CSADescriptor {
+  /// If the conditional assignment occurs inside a loop, then Phi chooses
+  /// the value of the assignment from the entry block or the loop body block.
+  PHINode *Phi = nullptr;
+
+  /// The initial value of the CSA. If the condition guarding the assignment is
+  /// not met, then the assignment retains this value.
+  Value *InitScalar = nullptr;
+
+  /// The Instruction that conditionally assigned to inside the loop.
+  Instruction *Assignment = nullptr;
+
+  /// Create a CSA Descriptor that models an invalid CSA.
+  CSADescriptor() = default;
+
+  /// Create a CSA Descriptor that models a valid CSA with its members
+  /// initialized correctly.
+  CSADescriptor(PHINode *Phi, Instruction *Assignment, Value *InitScalar)
+      : Phi(Phi), InitScalar(InitScalar), Assignment(Assignment) {}
+
+public:
+  /// If Phi is the root of a CSA, return the CSADescriptor of the CSA rooted by
+  /// Phi. Otherwise, return a CSADescriptor with IsValidCSA set to false.
+  static CSADescriptor isCSAPhi(PHINode *Phi, Loop *TheLoop);
+
+  operator bool() const { return isValid(); }
+
+  /// Returns whether SI is the Assignment in CSA
+  static bool isCSASelect(CSADescriptor Desc, SelectInst *SI) {
+    return Desc.getAssignment() == SI;
+  }
+
+  /// Return whether this CSADescriptor models a valid CSA.
+  bool isValid() const { return Phi && InitScalar && Assignment; }
+
+  /// Return the PHI that roots this CSA.
+  PHINode *getPhi() const { return Phi; }
+
+  /// Return the initial value of the CSA. This is the value if the conditional
+  /// assignment does not occur.
+  Value *getInitScalar() const { return InitScalar; }
+
+  /// The Instruction that is used after the loop
+  Instruction *getAssignment() const { return Assignment; }
+
+  /// Return the condition that this CSA is conditional upon.
+  Value *getCond() const {
+    if (auto *SI = dyn_cast_or_null<SelectInst>(Assignment))
+      return SI->getCondition();
+    return nullptr;
+  }
+};
+} // namespace llvm
+
+#endif // LLVM_ANALYSIS_CSADESCRIPTORS_H
diff --git a/llvm/lib/Analysis/CMakeLists.txt b/llvm/lib/Analysis/CMakeLists.txt
index 393803fad89383..24ca426990d9ed 100644
--- a/llvm/lib/Analysis/CMakeLists.txt
+++ b/llvm/lib/Analysis/CMakeLists.txt
@@ -46,6 +46,7 @@ add_llvm_component_library(LLVMAnalysis
   CostModel.cpp
   CodeMetrics.cpp
   ConstantFolding.cpp
+  CSADescriptors.cpp
   CtxProfAnalysis.cpp
   CycleAnalysis.cpp
   DDG.cpp
diff --git a/llvm/lib/Analysis/CSADescriptors.cpp b/llvm/lib/Analysis/CSADescriptors.cpp
new file mode 100644
index 00000000000000..d0377c8c16de33
--- /dev/null
+++ b/llvm/lib/Analysis/CSADescriptors.cpp
@@ -0,0 +1,73 @@
+//=== llvm/Analysis/CSADescriptors.cpp - CSA Descriptors -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file "describes" conditional scalar assignments (CSA).
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/CSADescriptors.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "csa-descriptors"
+
+CSADescriptor CSADescriptor::isCSAPhi(PHINode *Phi, Loop *TheLoop) {
+  // Return CSADescriptor that describes a CSA that matches one of these
+  // patterns:
+  //   phi loop_inv, (select cmp, value, phi)
+  //   phi loop_inv, (select cmp, phi, value)
+  //   phi (select cmp, value, phi), loop_inv
+  //   phi (select cmp, phi, value), loop_inv
+  // If the CSA does not match any of these paterns, return a CSADescriptor
+  // that describes an InvalidCSA.
+
+  // Must be a scalar
+  Type *Type = Phi->getType();
+  if (!Type->isIntegerTy() && !Type->isFloatingPointTy() &&
+      !Type->isPointerTy())
+    return CSADescriptor();
+
+  // Match phi loop_inv, (select cmp, value, phi)
+  //    or phi loop_inv, (select cmp, phi, value)
+  //    or phi (select cmp, value, phi), loop_inv
+  //    or phi (select cmp, phi, value), loop_inv
+  if (Phi->getNumIncomingValues() != 2)
+    return CSADescriptor();
+  auto SelectInstIt = find_if(Phi->incoming_values(), [&Phi](Use &U) {
+    return match(U.get(), m_Select(m_Value(), m_Specific(Phi), m_Value())) ||
+           match(U.get(), m_Select(m_Value(), m_Value(), m_Specific(Phi)));
+  });
+  if (SelectInstIt == Phi->incoming_values().end())
+    return CSADescriptor();
+  auto LoopInvIt = find_if(Phi->incoming_values(), [&](Use &U) {
+    return U.get() != *SelectInstIt && TheLoop->isLoopInvariant(U.get());
+  });
+  if (LoopInvIt == Phi->incoming_values().end())
+    return CSADescriptor();
+
+  // Phi or Sel must be used only outside the loop,
+  // excluding if Phi use Sel or Sel use Phi
+  auto IsOnlyUsedOutsideLoop = [=](Value *V, Value *Ignore) {
+    return all_of(V->users(), [Ignore, TheLoop](User *U) {
+      if (U == Ignore)
+        return true;
+      if (auto *I = dyn_cast<Instruction>(U))
+        return !TheLoop->contains(I);
+      return true;
+    });
+  };
+  auto *Sel = cast<SelectInst>(SelectInstIt->get());
+  auto *LoopInv = LoopInvIt->get();
+  if (!IsOnlyUsedOutsideLoop(Phi, Sel) || !IsOnlyUsedOutsideLoop(Sel, Phi))
+    return CSADescriptor();
+
+  return CSADescriptor(Phi, Sel, LoopInv);
+}

>From 97ef89bfc760904b9166586cd58df5a26caf3fe3 Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Thu, 8 Aug 2024 11:16:32 -0700
Subject: [PATCH 03/23] [LVL][CSA] Legalize CSA vectorization

---
 .../llvm/Analysis/TargetTransformInfo.h       |  9 +++++
 .../llvm/Analysis/TargetTransformInfoImpl.h   |  2 ++
 .../Vectorize/LoopVectorizationLegality.h     | 18 ++++++++++
 llvm/lib/Analysis/TargetTransformInfo.cpp     |  4 +++
 .../Target/RISCV/RISCVTargetTransformInfo.cpp |  5 +++
 .../Target/RISCV/RISCVTargetTransformInfo.h   |  4 +++
 .../Vectorize/LoopVectorizationLegality.cpp   | 34 ++++++++++++++++---
 7 files changed, 72 insertions(+), 4 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 89a85bc8a90864..affa9159ffa473 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1773,6 +1773,10 @@ class TargetTransformInfo {
         : EVLParamStrategy(EVLParamStrategy), OpStrategy(OpStrategy) {}
   };
 
+  /// \returns true if the loop vectorizer should vectorize conditional
+  /// scalar assignments for the target.
+  bool enableCSAVectorization() const;
+
   /// \returns How the target needs this vector-predicated operation to be
   /// transformed.
   VPLegalization getVPLegalizationStrategy(const VPIntrinsic &PI) const;
@@ -2182,6 +2186,7 @@ class TargetTransformInfo::Concept {
   virtual bool supportsScalableVectors() const = 0;
   virtual bool hasActiveVectorLength(unsigned Opcode, Type *DataType,
                                      Align Alignment) const = 0;
+  virtual bool enableCSAVectorization() const = 0;
   virtual VPLegalization
   getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0;
   virtual bool hasArmWideBranch(bool Thumb) const = 0;
@@ -2952,6 +2957,10 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
     return Impl.hasActiveVectorLength(Opcode, DataType, Alignment);
   }
 
+  bool enableCSAVectorization() const override {
+    return Impl.enableCSAVectorization();
+  }
+
   VPLegalization
   getVPLegalizationStrategy(const VPIntrinsic &PI) const override {
     return Impl.getVPLegalizationStrategy(PI);
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index eca8818cc25e62..871bd6cedb5427 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -968,6 +968,8 @@ class TargetTransformInfoImplBase {
     return false;
   }
 
+  bool enableCSAVectorization() const { return false; }
+
   TargetTransformInfo::VPLegalization
   getVPLegalizationStrategy(const VPIntrinsic &PI) const {
     return TargetTransformInfo::VPLegalization(
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index dc7e484a40a452..78d6a6cb7a59f8 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -27,6 +27,7 @@
 #define LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONLEGALITY_H
 
 #include "llvm/ADT/MapVector.h"
+#include "llvm/Analysis/CSADescriptors.h"
 #include "llvm/Analysis/LoopAccessAnalysis.h"
 #include "llvm/Support/TypeSize.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
@@ -269,6 +270,10 @@ class LoopVectorizationLegality {
   /// induction descriptor.
   using InductionList = MapVector<PHINode *, InductionDescriptor>;
 
+  /// CSAList contains the CSA descriptors for all the CSAs that were found
+  /// in the loop, rooted by their phis.
+  using CSAList = MapVector<PHINode *, CSADescriptor>;
+
   /// RecurrenceSet contains the phi nodes that are recurrences other than
   /// inductions and reductions.
   using RecurrenceSet = SmallPtrSet<const PHINode *, 8>;
@@ -321,6 +326,12 @@ class LoopVectorizationLegality {
   /// Returns True if V is a Phi node of an induction variable in this loop.
   bool isInductionPhi(const Value *V) const;
 
+  /// Returns the CSAs found in the loop.
+  const CSAList &getCSAs() const { return CSAs; }
+
+  /// Returns true if Phi is the root of a CSA in the loop.
+  bool isCSAPhi(PHINode *Phi) const { return CSAs.count(Phi) != 0; }
+
   /// Returns a pointer to the induction descriptor, if \p Phi is an integer or
   /// floating point induction.
   const InductionDescriptor *getIntOrFpInductionDescriptor(PHINode *Phi) const;
@@ -545,6 +556,10 @@ class LoopVectorizationLegality {
   void addInductionPhi(PHINode *Phi, const InductionDescriptor &ID,
                        SmallPtrSetImpl<Value *> &AllowedExit);
 
+  // Updates the vetorization state by adding \p Phi to the CSA list.
+  void addCSAPhi(PHINode *Phi, const CSADescriptor &CSADesc,
+                 SmallPtrSetImpl<Value *> &AllowedExit);
+
   /// The loop that we evaluate.
   Loop *TheLoop;
 
@@ -589,6 +604,9 @@ class LoopVectorizationLegality {
   /// variables can be pointers.
   InductionList Inductions;
 
+  /// Holds the conditional scalar assignments
+  CSAList CSAs;
+
   /// Holds all the casts that participate in the update chain of the induction
   /// variables, and that have been proven to be redundant (possibly under a
   /// runtime guard). These casts can be ignored when creating the vectorized
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index b5195f764cbd1c..f8f96473fce7d3 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1310,6 +1310,10 @@ bool TargetTransformInfo::preferEpilogueVectorization() const {
   return TTIImpl->preferEpilogueVectorization();
 }
 
+bool TargetTransformInfo::enableCSAVectorization() const {
+  return TTIImpl->enableCSAVectorization();
+}
+
 TargetTransformInfo::VPLegalization
 TargetTransformInfo::getVPLegalizationStrategy(const VPIntrinsic &VPI) const {
   return TTIImpl->getVPLegalizationStrategy(VPI);
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index e041854ee8fd6e..589441d3ccb121 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -2217,6 +2217,11 @@ bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                   C2.ScaleCost, C2.ImmCost, C2.SetupCost);
 }
 
+bool RISCVTTIImpl::enableCSAVectorization() const {
+  return ST->hasVInstructions() &&
+         ST->getProcFamily() == RISCVSubtarget::SiFive7;
+}
+
 bool RISCVTTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) {
   auto *VTy = dyn_cast<VectorType>(DataTy);
   if (!VTy || VTy->isScalableTy())
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 65bbd905508557..6ed773b520a4f8 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -301,6 +301,10 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
     return TLI->isVScaleKnownToBeAPowerOfTwo();
   }
 
+  /// \returns true if the loop vectorizer should vectorize conditional
+  /// scalar assignments for the target.
+  bool enableCSAVectorization() const;
+
   /// \returns How the target needs this vector-predicated operation to be
   /// transformed.
   TargetTransformInfo::VPLegalization
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 43be72f0f34d45..ec67fc05ffcf78 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -83,6 +83,10 @@ static cl::opt<bool> EnableHistogramVectorization(
     "enable-histogram-loop-vectorization", cl::init(false), cl::Hidden,
     cl::desc("Enables autovectorization of some loops containing histograms"));
 
+static cl::opt<bool>
+    EnableCSA("enable-csa-vectorization", cl::init(false), cl::Hidden,
+              cl::desc("Control whether CSA loop vectorization is enabled"));
+
 /// Maximum vectorization interleave count.
 static const unsigned MaxInterleaveFactor = 16;
 
@@ -753,6 +757,15 @@ bool LoopVectorizationLegality::setupOuterLoopInductions() {
   return llvm::all_of(Header->phis(), IsSupportedPhi);
 }
 
+void LoopVectorizationLegality::addCSAPhi(
+    PHINode *Phi, const CSADescriptor &CSADesc,
+    SmallPtrSetImpl<Value *> &AllowedExit) {
+  assert(CSADesc.isValid() && "Expected Valid CSADescriptor");
+  LLVM_DEBUG(dbgs() << "LV: found legal CSA opportunity" << *Phi << "\n");
+  AllowedExit.insert(Phi);
+  CSAs.insert({Phi, CSADesc});
+}
+
 /// Checks if a function is scalarizable according to the TLI, in
 /// the sense that it should be vectorized and then expanded in
 /// multiple scalar calls. This is represented in the
@@ -870,14 +883,23 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
           continue;
         }
 
-        // As a last resort, coerce the PHI to a AddRec expression
-        // and re-try classifying it a an induction PHI.
+        // Try to coerce the PHI to a AddRec expression and re-try classifying
+        // it a an induction PHI.
         if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true) &&
             !IsDisallowedStridedPointerInduction(ID)) {
           addInductionPhi(Phi, ID, AllowedExit);
           continue;
         }
 
+        // Check if the PHI can be classified as a CSA PHI.
+        if (EnableCSA || (TTI->enableCSAVectorization() &&
+                          EnableCSA.getNumOccurrences() == 0)) {
+          if (auto CSADesc = CSADescriptor::isCSAPhi(Phi, TheLoop)) {
+            addCSAPhi(Phi, CSADesc, AllowedExit);
+            continue;
+          }
+        }
+
         reportVectorizationFailure("Found an unidentified PHI",
             "value that could not be identified as "
             "reduction is used outside the loop",
@@ -1849,11 +1871,15 @@ bool LoopVectorizationLegality::canFoldTailByMasking() const {
   for (const auto &Reduction : getReductionVars())
     ReductionLiveOuts.insert(Reduction.second.getLoopExitInstr());
 
+  SmallPtrSet<const Value *, 8> CSALiveOuts;
+  for (const auto &CSA : getCSAs())
+    CSALiveOuts.insert(CSA.second.getAssignment());
+
   // TODO: handle non-reduction outside users when tail is folded by masking.
   for (auto *AE : AllowedExit) {
     // Check that all users of allowed exit values are inside the loop or
-    // are the live-out of a reduction.
-    if (ReductionLiveOuts.count(AE))
+    // are the live-out of a reduction or a CSA
+    if (ReductionLiveOuts.count(AE) || CSALiveOuts.count(AE))
       continue;
     for (User *U : AE->users()) {
       Instruction *UI = cast<Instruction>(U);

>From c295e368cb105401e8520581f9f5bd073f99bc9d Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Thu, 15 Aug 2024 11:00:06 -0700
Subject: [PATCH 04/23] [LV] Build VPlan for CSA

---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  189 +-
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |    7 +-
 llvm/lib/Transforms/Vectorize/VPlan.h         |  184 ++
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  269 +-
 .../Transforms/Vectorize/VPlanTransforms.cpp  |   44 +
 llvm/lib/Transforms/Vectorize/VPlanValue.h    |    3 +
 .../Transforms/Vectorize/VPlanVerifier.cpp    |    6 +-
 .../Transforms/LoopVectorize/RISCV/csa.ll     | 2457 ++++++++++++++---
 8 files changed, 2776 insertions(+), 383 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b1077d37b4cdc7..e2f11543c564ed 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -173,6 +173,8 @@ const char LLVMLoopVectorizeFollowupEpilogue[] =
 STATISTIC(LoopsVectorized, "Number of loops vectorized");
 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
+STATISTIC(CSAsVectorized,
+          "Number of conditional scalar assignments vectorized");
 
 static cl::opt<bool> EnableEpilogueVectorization(
     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
@@ -493,6 +495,10 @@ class InnerLoopVectorizer {
   virtual std::pair<BasicBlock *, Value *>
   createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
 
+  /// For all vectorized CSAs, replace uses of live-out scalar from the orignal
+  /// loop with the extracted scalar from the vector loop for.
+  void fixCSALiveOuts(VPTransformState &State, VPlan &Plan);
+
   /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
   void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
 
@@ -2914,6 +2920,25 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
                                    TargetTransformInfo::TCK_RecipThroughput);
 }
 
+void InnerLoopVectorizer::fixCSALiveOuts(VPTransformState &State, VPlan &Plan) {
+  for (const auto &CSA : Plan.getCSAStates()) {
+    VPCSADataUpdateRecipe *VPDataUpdate = CSA.second->getDataUpdate();
+    assert(VPDataUpdate &&
+           "VPDataUpdate must have been introduced prior to fixing live outs");
+    Value *V = VPDataUpdate->getUnderlyingValue();
+    Value *ExtractedScalar = State.get(CSA.second->getExtractScalarRecipe(), 0,
+                                       /*NeedsScalar=*/true);
+    // Fix LCSSAPhis
+    llvm::SmallPtrSet<PHINode *, 2> ToFix;
+    for (User *U : V->users())
+      if (auto *Phi = dyn_cast<PHINode>(U);
+          Phi && Phi->getParent() == LoopExitBlock)
+        ToFix.insert(Phi);
+    for (PHINode *Phi : ToFix)
+      Phi->addIncoming(ExtractedScalar, LoopMiddleBlock);
+  }
+}
+
 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
                                             VPlan &Plan) {
   // Fix widened non-induction PHIs by setting up the PHI operands.
@@ -2950,6 +2975,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
       fixupIVUsers(Entry.first, Entry.second,
                    getOrCreateVectorTripCount(nullptr),
                    IVEndValues[Entry.first], LoopMiddleBlock, Plan, State);
+    fixCSALiveOuts(State, Plan);
   }
 
   // Fix live-out phis not already fixed earlier.
@@ -4461,6 +4487,9 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
       case VPDef::VPEVLBasedIVPHISC:
       case VPDef::VPPredInstPHISC:
       case VPDef::VPBranchOnMaskSC:
+      case VPRecipeBase::VPCSADataUpdateSC:
+      case VPRecipeBase::VPCSAExtractScalarSC:
+      case VPRecipeBase::VPCSAHeaderPHISC:
         continue;
       case VPDef::VPReductionSC:
       case VPDef::VPActiveLaneMaskPHISC:
@@ -8562,9 +8591,6 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
       return Recipe;
 
     VPHeaderPHIRecipe *PhiRecipe = nullptr;
-    assert((Legal->isReductionVariable(Phi) ||
-            Legal->isFixedOrderRecurrence(Phi)) &&
-           "can only widen reductions and fixed-order recurrences here");
     VPValue *StartV = Operands[0];
     if (Legal->isReductionVariable(Phi)) {
       const RecurrenceDescriptor &RdxDesc =
@@ -8574,12 +8600,23 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
       PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
                                            CM.isInLoopReduction(Phi),
                                            CM.useOrderedReductions(RdxDesc));
-    } else {
+    } else if (Legal->isFixedOrderRecurrence(Phi)) {
       // TODO: Currently fixed-order recurrences are modeled as chains of
       // first-order recurrences. If there are no users of the intermediate
       // recurrences in the chain, the fixed order recurrence should be modeled
       // directly, enabling more efficient codegen.
       PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
+    } else if (Legal->isCSAPhi(Phi)) {
+      VPCSAState *State = Plan.getCSAStates().find(Phi)->second;
+      VPValue *InitData = State->getVPInitData();
+      // When the VF=getFixed(1), InitData is just InitScalar.
+      if (!InitData)
+        InitData = State->getVPInitScalar();
+      PhiRecipe = new VPCSAHeaderPHIRecipe(Phi, InitData);
+      State->setPhiRecipe(cast<VPCSAHeaderPHIRecipe>(PhiRecipe));
+    } else {
+      llvm_unreachable(
+          "can only widen reductions, fixed-order recurrences, and CSAs here");
     }
 
     PhisToFix.push_back(PhiRecipe);
@@ -8613,6 +8650,19 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
                                 make_range(Operands.begin(), Operands.end()));
 
   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
+    auto *CSADescIt = find_if(Legal->getCSAs(), [&](auto CSA) {
+      return CSADescriptor::isCSASelect(CSA.second, SI);
+    });
+    if (CSADescIt != Legal->getCSAs().end()) {
+      PHINode *CSAPhi = CSADescIt->first;
+      VPCSAState *State = Plan.getCSAStates().find(CSAPhi)->second;
+      VPValue *VPDataPhi = State->getPhiRecipe();
+      auto *R = new VPCSADataUpdateRecipe(
+          SI, {VPDataPhi, Operands[0], Operands[1], Operands[2]});
+      State->setDataUpdate(R);
+      return R;
+    }
+
     return new VPWidenSelectRecipe(
         *SI, make_range(Operands.begin(), Operands.end()));
   }
@@ -8625,6 +8675,107 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
   return tryToWiden(Instr, Operands, VPBB);
 }
 
+/// Add CSA Recipes that can occur before each instruction in the input IR
+/// is processed and introduced into VPlan.
+static void
+addCSAPreprocessRecipes(const LoopVectorizationLegality::CSAList &CSAs,
+                        Loop *OrigLoop, VPBasicBlock *PreheaderVPBB,
+                        VPBasicBlock *HeaderVPBB, DebugLoc DL, VFRange &Range,
+                        VPlan &Plan) {
+
+  // Don't build full CSA for VF=ElementCount::getFixed(1)
+  bool IsScalarVF = LoopVectorizationPlanner::getDecisionAndClampRange(
+      [&](ElementCount VF) { return VF.isScalar(); }, Range);
+
+  for (const auto &CSA : CSAs) {
+    VPValue *VPInitScalar = Plan.getOrAddLiveIn(
+        CSA.first->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
+
+    // Scalar VF builds the scalar version of the loop. In that case,
+    // no maintenence of mask nor extraction in middle block is needed.
+    if (IsScalarVF) {
+      VPCSAState *S = new VPCSAState(VPInitScalar);
+      Plan.addCSAState(CSA.first, S);
+      continue;
+    }
+
+    auto *VPInitMask =
+        new VPInstruction(VPInstruction::CSAInitMask, {}, DL, "csa.init.mask");
+    auto *VPInitData = new VPInstruction(VPInstruction::CSAInitData,
+                                         {VPInitScalar}, DL, "csa.init.data");
+    PreheaderVPBB->appendRecipe(VPInitMask);
+    PreheaderVPBB->appendRecipe(VPInitData);
+
+    auto *VPMaskPhi = new VPInstruction(VPInstruction::CSAMaskPhi, {VPInitMask},
+                                        DL, "csa.mask.phi");
+    HeaderVPBB->appendRecipe(VPMaskPhi);
+
+    auto *S = new VPCSAState(VPInitScalar, VPInitData, VPMaskPhi);
+    Plan.addCSAState(CSA.first, S);
+  }
+}
+
+/// Add CSA Recipes that must occur after each instruction in the input IR
+/// is processed and introduced into VPlan.
+static void
+addCSAPostprocessRecipes(VPRecipeBuilder &RecipeBuilder,
+                         const LoopVectorizationLegality::CSAList &CSAs,
+                         VPBasicBlock *MiddleVPBB, DebugLoc DL, VFRange &Range,
+                         VPlan &Plan) {
+  // Don't build CSA for VF=ElementCount::getFixed(1)
+  if (LoopVectorizationPlanner::getDecisionAndClampRange(
+          [&](ElementCount VF) { return VF.isScalar(); }, Range))
+    return;
+
+  for (const auto &CSA : CSAs) {
+    VPCSAState *CSAState = Plan.getCSAStates().find(CSA.first)->second;
+    VPCSADataUpdateRecipe *VPDataUpdate = CSAState->getDataUpdate();
+
+    assert(VPDataUpdate &&
+           "VPDataUpdate must have been introduced prior to postprocess");
+    assert(CSA.second.getCond() &&
+           "CSADescriptor must know how to describe the condition");
+    auto GetVPValue = [&](Value *I) {
+      return RecipeBuilder.getRecipe(cast<Instruction>(I))->getVPSingleValue();
+    };
+    VPValue *WidenedCond = GetVPValue(CSA.second.getCond());
+    VPValue *VPInitScalar = CSAState->getVPInitScalar();
+
+    // The CSA optimization wants to use a condition such that when it is
+    // true, a new value is assigned. However, it is possible that a true lane
+    // in WidenedCond corresponds to selection of the initial value instead.
+    // In that case, we must use the negation of WidenedCond.
+    // i.e. select cond new_val old_val versus select cond.not old_val new_val
+    VPValue *CondToUse = WidenedCond;
+    if (cast<SelectInst>(CSA.second.getAssignment())->getTrueValue() ==
+        CSA.first) {
+      auto *VPNotCond = new VPInstruction(VPInstruction::Not, WidenedCond, DL);
+      VPNotCond->insertBefore(
+          GetVPValue(CSA.second.getAssignment())->getDefiningRecipe());
+      CondToUse = VPNotCond;
+    }
+
+    auto *VPAnyActive = new VPInstruction(
+        VPInstruction::CSAAnyActive, {CondToUse}, DL, "csa.cond.anyactive");
+    VPAnyActive->insertBefore(
+        GetVPValue(CSA.second.getAssignment())->getDefiningRecipe());
+
+    auto *VPMaskSel = new VPInstruction(
+        VPInstruction::CSAMaskSel,
+        {CondToUse, CSAState->getVPMaskPhi(), VPAnyActive}, DL, "csa.mask.sel");
+    VPMaskSel->insertAfter(VPAnyActive);
+    VPDataUpdate->setVPNewMaskAndVPAnyActive(VPMaskSel, VPAnyActive);
+    VPCSAExtractScalarRecipe *ExtractScalarRecipe =
+        new VPCSAExtractScalarRecipe({VPInitScalar, VPMaskSel, VPDataUpdate});
+
+    MiddleVPBB->insert(ExtractScalarRecipe, MiddleVPBB->getFirstNonPhi());
+
+    // Update CSAState with new recipes
+    CSAState->setExtractScalarRecipe(ExtractScalarRecipe);
+    CSAState->setVPAnyActive(VPAnyActive);
+  }
+}
+
 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                         ElementCount MaxVF) {
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
@@ -8682,7 +8833,8 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
 // increments.
 static SetVector<VPIRInstruction *> collectUsersInExitBlock(
     Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
-    const MapVector<PHINode *, InductionDescriptor> &Inductions) {
+    const MapVector<PHINode *, InductionDescriptor> &Inductions,
+    const MapVector<PHINode *, CSADescriptor> &CSAs) {
   auto *MiddleVPBB =
       cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
   // No edge from the middle block to the unique exit block has been inserted
@@ -8716,6 +8868,17 @@ static SetVector<VPIRInstruction *> collectUsersInExitBlock(
            return P && Inductions.contains(P);
          })))
       continue;
+    // Exit values for CSAs are computed and updated outside of VPlan and
+    // independent of induction recipes.
+    // TODO: Compute induction exit values in VPlan, use VPLiveOuts to update
+    // live-outs.
+    if (isa<VPCSADataUpdateRecipe>(V) &&
+        (isa<Instruction>(IncomingValue) &&
+         any_of(IncomingValue->users(), [&CSAs](User *U) {
+           auto *P = dyn_cast<PHINode>(U);
+           return P && CSAs.contains(P);
+         })))
+      continue;
     ExitUsersToFix.insert(ExitIRI);
     ExitIRI->addOperand(V);
   }
@@ -8929,6 +9092,10 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
   bool HasNUW = Style == TailFoldingStyle::None;
   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
 
+  addCSAPreprocessRecipes(Legal->getCSAs(), OrigLoop, Plan->getPreheader(),
+                          Plan->getVectorLoopRegion()->getEntryBasicBlock(), DL,
+                          Range, *Plan);
+
   VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
 
   // ---------------------------------------------------------------------------
@@ -9035,6 +9202,11 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
     VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
   }
 
+  VPBasicBlock *MiddleVPBB =
+      cast<VPBasicBlock>(Plan->getVectorLoopRegion()->getSingleSuccessor());
+  addCSAPostprocessRecipes(RecipeBuilder, Legal->getCSAs(), MiddleVPBB, DL,
+                           Range, *Plan);
+
   // After here, VPBB should not be used.
   VPBB = nullptr;
 
@@ -9044,8 +9216,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
          "VPBasicBlock");
   RecipeBuilder.fixHeaderPhis();
 
-  SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlock(
-      OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
+  SetVector<VPIRInstruction *> ExitUsersToFix =
+      collectUsersInExitBlock(OrigLoop, RecipeBuilder, *Plan,
+                              Legal->getInductionVars(), Legal->getCSAs());
   addLiveOutsForFirstOrderRecurrences(*Plan, ExitUsersToFix);
   addUsersInExitBlock(*Plan, ExitUsersToFix);
 
@@ -10145,6 +10318,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
         const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan(
             EPI.MainLoopVF, EPI.MainLoopUF, *BestMainPlan, MainILV, DT, true);
         ++LoopsVectorized;
+        CSAsVectorized += LVL.getCSAs().size();
 
         // Second pass vectorizes the epilogue and adjusts the control flow
         // edges from the first pass.
@@ -10239,6 +10413,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
                                PSI, Checks);
         LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
         ++LoopsVectorized;
+        CSAsVectorized += LVL.getCSAs().size();
 
         // Add metadata to disable runtime unrolling a scalar loop when there
         // are no runtime checks about strides and memory. A scalar loop that is
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 6ddbfcf0ecfe58..e62e29b57f258b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -217,7 +217,7 @@ void VPBlockBase::deleteCFG(VPBlockBase *Entry) {
 
 VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() {
   iterator It = begin();
-  while (It != end() && It->isPhi())
+  while (It != end() && vputils::isPhi(*It))
     It++;
   return It;
 }
@@ -861,6 +861,9 @@ VPlan::~VPlan() {
     delete VPV;
   if (BackedgeTakenCount)
     delete BackedgeTakenCount;
+
+  for (std::pair<PHINode *, VPCSAState *> &S : CSAStates)
+    delete S.second;
 }
 
 static VPIRBasicBlock *createVPIRBasicBlockFor(BasicBlock *BB) {
@@ -1063,7 +1066,7 @@ void VPlan::execute(VPTransformState *State) {
   VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock();
   for (VPRecipeBase &R : Header->phis()) {
     // Skip phi-like recipes that generate their backedege values themselves.
-    if (isa<VPWidenPHIRecipe>(&R))
+    if (vputils::isPhiThatGeneratesBackedge(R))
       continue;
 
     if (isa<VPWidenPointerInductionRecipe>(&R) ||
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index c4567362eaffc7..e4fc09a4d650fd 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -231,6 +231,53 @@ class VPLane {
   }
 };
 
+class VPInstruction;
+class VPCSAHeaderPHIRecipe;
+class VPCSADataUpdateRecipe;
+class VPCSAExtractScalarRecipe;
+
+/// VPCSAState holds information required to vectorize a conditional scalar
+/// assignment.
+class VPCSAState {
+  VPValue *VPInitScalar = nullptr;
+  VPInstruction *VPInitData = nullptr;
+  VPInstruction *VPMaskPhi = nullptr;
+  VPInstruction *VPAnyActive = nullptr;
+  VPCSAHeaderPHIRecipe *VPPhiRecipe = nullptr;
+  VPCSADataUpdateRecipe *VPDataUpdate = nullptr;
+  VPCSAExtractScalarRecipe *VPExtractScalar = nullptr;
+
+public:
+  VPCSAState(VPValue *VPInitScalar, VPInstruction *InitData,
+             VPInstruction *MaskPhi)
+      : VPInitScalar(VPInitScalar), VPInitData(InitData), VPMaskPhi(MaskPhi) {}
+
+  VPCSAState(VPValue *VPInitScalar) : VPInitScalar(VPInitScalar) {}
+
+  VPValue *getVPInitScalar() const { return VPInitScalar; }
+
+  VPInstruction *getVPInitData() const { return VPInitData; }
+
+  VPInstruction *getVPMaskPhi() const { return VPMaskPhi; }
+
+  void setVPAnyActive(VPInstruction *AnyActive) { VPAnyActive = AnyActive; }
+  VPInstruction *getVPAnyActive() { return VPAnyActive; }
+
+  VPCSAHeaderPHIRecipe *getPhiRecipe() const { return VPPhiRecipe; }
+
+  void setPhiRecipe(VPCSAHeaderPHIRecipe *R) { VPPhiRecipe = R; }
+
+  VPCSADataUpdateRecipe *getDataUpdate() const { return VPDataUpdate; }
+  void setDataUpdate(VPCSADataUpdateRecipe *R) { VPDataUpdate = R; }
+
+  void setExtractScalarRecipe(VPCSAExtractScalarRecipe *R) {
+    VPExtractScalar = R;
+  }
+  VPCSAExtractScalarRecipe *getExtractScalarRecipe() const {
+    return VPExtractScalar;
+  }
+};
+
 /// VPTransformState holds information passed down when "executing" a VPlan,
 /// needed for generating the output IR.
 struct VPTransformState {
@@ -899,6 +946,9 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
     case VPRecipeBase::VPWidenPointerInductionSC:
     case VPRecipeBase::VPReductionPHISC:
     case VPRecipeBase::VPScalarCastSC:
+    case VPRecipeBase::VPCSAHeaderPHISC:
+    case VPRecipeBase::VPCSADataUpdateSC:
+    case VPRecipeBase::VPCSAExtractScalarSC:
       return true;
     case VPRecipeBase::VPBranchOnMaskSC:
     case VPRecipeBase::VPInterleaveSC:
@@ -1245,6 +1295,14 @@ class VPInstruction : public VPRecipeWithIRFlags,
     // operand). Only generates scalar values (either for the first lane only or
     // for all lanes, depending on its uses).
     PtrAdd,
+    CSAInitMask,
+    CSAInitData,
+    CSAMaskPhi,
+    CSAMaskSel,
+    CSAVLPhi,
+    CSAVLSel,
+    CSAAnyActive,
+    CSAAnyActiveEVL,
   };
 
 private:
@@ -2581,6 +2639,110 @@ class VPBranchOnMaskRecipe : public VPRecipeBase {
   }
 };
 
+class VPCSAHeaderPHIRecipe final : public VPHeaderPHIRecipe {
+public:
+  VPCSAHeaderPHIRecipe(PHINode *Phi, VPValue *VPInitData)
+      : VPHeaderPHIRecipe(VPDef::VPCSAHeaderPHISC, Phi, VPInitData) {}
+
+  ~VPCSAHeaderPHIRecipe() override = default;
+
+  VPCSAHeaderPHIRecipe *clone() override {
+    return new VPCSAHeaderPHIRecipe(cast<PHINode>(getUnderlyingInstr()),
+                                    getOperand(0));
+  }
+
+  void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+
+  VP_CLASSOF_IMPL(VPDef::VPCSAHeaderPHISC)
+
+  static inline bool classof(const VPHeaderPHIRecipe *R) {
+    return R->getVPDefID() == VPDef::VPCSAHeaderPHISC;
+  }
+
+  VPValue *getVPInitData() { return getOperand(0); }
+  VPValue *getVPNewData() { return getOperand(1); }
+};
+
+class VPCSADataUpdateRecipe final : public VPSingleDefRecipe {
+public:
+  VPCSADataUpdateRecipe(SelectInst *SI, ArrayRef<VPValue *> Operands)
+      : VPSingleDefRecipe(VPDef::VPCSADataUpdateSC, Operands, SI) {}
+
+  ~VPCSADataUpdateRecipe() override = default;
+
+  VPCSADataUpdateRecipe *clone() override {
+    SmallVector<VPValue *> Ops(operands());
+    return new VPCSADataUpdateRecipe(cast<SelectInst>(getUnderlyingInstr()),
+                                     Ops);
+  }
+
+  void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+
+  VP_CLASSOF_IMPL(VPDef::VPCSADataUpdateSC)
+
+  VPValue *getVPDataPhi() const { return getOperand(0); }
+
+  // The condition from the original select statement
+  VPValue *getVPCond() const { return getOperand(1); }
+
+  // The true value from the original select statement
+  VPValue *getVPTrue() const { return getOperand(2); }
+
+  // The false value from the original select statement
+  VPValue *getVPFalse() const { return getOperand(3); }
+
+  // We combine the setters so we can be sure NewMask is before AnyActive
+  // in the operands list, so the getters can be sure which operand numbers
+  // to get.
+  void setVPNewMaskAndVPAnyActive(VPValue *NewMask, VPValue *AnyActive) {
+    addOperand(NewMask);
+    addOperand(AnyActive);
+  }
+
+  VPValue *getVPNewMask() const { return getOperand(4); }
+
+  VPValue *getVPAnyActive() const { return getOperand(5); }
+};
+
+class VPCSAExtractScalarRecipe final : public VPSingleDefRecipe {
+public:
+  VPCSAExtractScalarRecipe(ArrayRef<VPValue *> Operands)
+      : VPSingleDefRecipe(VPDef::VPCSAExtractScalarSC, Operands) {}
+
+  ~VPCSAExtractScalarRecipe() override = default;
+
+  VPCSAExtractScalarRecipe *clone() override {
+    SmallVector<VPValue *> Ops(operands());
+    return new VPCSAExtractScalarRecipe(Ops);
+  }
+
+  void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+
+  VPValue *getVPInitScalar() const { return getOperand(0); }
+  VPValue *getVPMaskSel() const { return getOperand(1); }
+  VPValue *getVPDataSel() const { return getOperand(2); }
+  VPValue *getVPCSAVLSel() const { return getOperand(3); }
+  bool usesEVL() { return getNumOperands() == 4; }
+};
+
 /// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when
 /// control converges back from a Branch-on-Mask. The phi nodes are needed in
 /// order to merge values that are set under such a branch and feed their uses.
@@ -3497,6 +3659,8 @@ class VPlan {
   /// live-outs are fixed via VPLiveOut::fixPhi.
   MapVector<PHINode *, VPLiveOut *> LiveOuts;
 
+  MapVector<PHINode *, VPCSAState *> CSAStates;
+
   /// Mapping from SCEVs to the VPValues representing their expansions.
   /// NOTE: This mapping is temporary and will be removed once all users have
   /// been modeled in VPlan directly.
@@ -3541,6 +3705,12 @@ class VPlan {
                                      bool RequiresScalarEpilogueCheck,
                                      bool TailFolded, Loop *TheLoop);
 
+  void addCSAState(PHINode *Phi, VPCSAState *S) { CSAStates.insert({Phi, S}); }
+
+  MapVector<PHINode *, VPCSAState *> const &getCSAStates() const {
+    return CSAStates;
+  }
+
   /// Prepare the plan for execution, setting up the required live-in values.
   void prepareToExecute(Value *TripCount, Value *VectorTripCount,
                         Value *CanonicalIVStartValue, VPTransformState &State);
@@ -4005,6 +4175,20 @@ class VPlanSlp {
   /// Return true if all visited instruction can be combined.
   bool isCompletelySLP() const { return CompletelySLP; }
 };
+
+namespace vputils {
+
+/// Returns true for PHI-like recipes.
+bool isPhi(const VPRecipeBase &R);
+
+/// Returns true for PHI-like recipes that generate their own backedge
+bool isPhiThatGeneratesBackedge(const VPRecipeBase &R);
+
+/// Returns true for PHI-like recipes that exists in vector loop header basic
+/// block
+bool isHeaderPhi(const VPRecipeBase &R);
+} // end namespace vputils
+
 } // end namespace llvm
 
 #endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 75908638532950..8380e946a8ebb6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -388,6 +388,10 @@ bool VPInstruction::canGenerateScalarForFirstLane() const {
   case VPInstruction::CanonicalIVIncrementForPart:
   case VPInstruction::PtrAdd:
   case VPInstruction::ExplicitVectorLength:
+  case VPInstruction::CSAVLSel:
+  case VPInstruction::CSAVLPhi:
+  case VPInstruction::CSAAnyActive:
+  case VPInstruction::CSAAnyActiveEVL:
     return true;
   default:
     return false;
@@ -669,6 +673,94 @@ Value *VPInstruction::generate(VPTransformState &State) {
     }
     return NewPhi;
   }
+  case VPInstruction::CSAInitMask: {
+    if (Part == 0) {
+      Value *InitMask = ConstantAggregateZero::get(VectorType::get(
+          Type::getInt1Ty(State.Builder.getContext()), State.VF));
+      State.set(this, InitMask, Part);
+      return InitMask;
+    }
+    Value *V = State.get(this, Part - 1);
+    return V;
+  }
+  case VPInstruction::CSAInitData: {
+    if (Part == 0) {
+      Type *ElemTyp = getOperand(0)->getUnderlyingValue()->getType();
+      Value *InitData = PoisonValue::get(VectorType::get(ElemTyp, State.VF));
+      State.set(this, InitData, Part);
+      return InitData;
+    }
+    Value *V = State.get(this, Part - 1);
+    return V;
+  }
+  case VPInstruction::CSAMaskPhi: {
+    if (Part == 0) {
+      IRBuilder<>::InsertPointGuard Guard(State.Builder);
+      State.Builder.SetInsertPoint(State.CFG.PrevBB->getFirstNonPHI());
+      BasicBlock *PreheaderBB = State.CFG.getPreheaderBBFor(this);
+      Value *InitMask = State.get(getOperand(0), Part);
+      PHINode *MaskPhi =
+          State.Builder.CreatePHI(InitMask->getType(), 2, "csa.mask.phi");
+      MaskPhi->addIncoming(InitMask, PreheaderBB);
+      State.set(this, MaskPhi, Part);
+      return MaskPhi;
+    }
+    Value *V = State.get(this, Part - 1);
+    return V;
+  }
+  case VPInstruction::CSAMaskSel: {
+    Value *WidenedCond = State.get(getOperand(0), Part);
+    Value *MaskPhi = State.get(getOperand(1), Part);
+    Value *AnyActive = State.get(getOperand(2), Part, /*NeedsScalar=*/true);
+    // If not the first Part, use the mask from the previous unrolled Part
+    Value *OldMask = Part == 0 ? MaskPhi : State.get(this, Part - 1);
+    Value *MaskSel = State.Builder.CreateSelect(AnyActive, WidenedCond, OldMask,
+                                                "csa.mask.sel");
+    // MaskPhi wants to use the most recently updated mask. That's the one
+    // that corresponds to the last Part.
+    if (Part == State.UF - 1)
+      cast<PHINode>(MaskPhi)->addIncoming(MaskSel, State.CFG.PrevBB);
+    return MaskSel;
+  }
+  case VPInstruction::CSAAnyActive: {
+    Value *WidenedCond = State.get(getOperand(0), Part);
+    return Builder.CreateOrReduce(WidenedCond);
+  }
+  case VPInstruction::CSAAnyActiveEVL: {
+    Value *WidenedCond = State.get(getOperand(0), Part);
+    Value *AllOnesMask = Constant::getAllOnesValue(
+        VectorType::get(Type::getInt1Ty(State.Builder.getContext()), State.VF));
+    Value *EVL = State.get(getOperand(1), Part, /*NeedsScalar=*/true);
+
+    Value *StartValue =
+        ConstantInt::get(WidenedCond->getType()->getScalarType(), 0);
+    Value *AnyActive = State.Builder.CreateIntrinsic(
+        WidenedCond->getType()->getScalarType(), Intrinsic::vp_reduce_or,
+        {StartValue, WidenedCond, AllOnesMask, EVL}, nullptr,
+        "csa.cond.anyactive");
+    return AnyActive;
+  }
+  case VPInstruction::CSAVLPhi: {
+    IRBuilder<>::InsertPointGuard Guard(State.Builder);
+    State.Builder.SetInsertPoint(State.CFG.PrevBB->getFirstNonPHI());
+    BasicBlock *PreheaderBB = State.CFG.getPreheaderBBFor(this);
+
+    // InitVL can be anything since it won't be used if no mask was active
+    Value *InitVL = ConstantInt::get(State.Builder.getInt32Ty(), 0);
+    PHINode *VLPhi =
+        State.Builder.CreatePHI(InitVL->getType(), 2, "csa.vl.phi");
+    VLPhi->addIncoming(InitVL, PreheaderBB);
+    return VLPhi;
+  }
+  case VPInstruction::CSAVLSel: {
+    Value *AnyActive = State.get(getOperand(0), Part, /*NeedsScalar=*/true);
+    Value *VLPhi = State.get(getOperand(1), Part, /*NeedsScalar=*/true);
+    Value *EVL = State.get(getOperand(2), Part, /*NeedsScalar=*/true);
+    Value *VLSel =
+        State.Builder.CreateSelect(AnyActive, EVL, VLPhi, "csa.vl.sel");
+    cast<PHINode>(VLPhi)->addIncoming(VLSel, State.CFG.PrevBB);
+    return VLSel;
+  }
 
   default:
     llvm_unreachable("Unsupported opcode for instruction");
@@ -677,11 +769,16 @@ Value *VPInstruction::generate(VPTransformState &State) {
 
 bool VPInstruction::isVectorToScalar() const {
   return getOpcode() == VPInstruction::ExtractFromEnd ||
-         getOpcode() == VPInstruction::ComputeReductionResult;
+         getOpcode() == VPInstruction::ComputeReductionResult ||
+         getOpcode() == VPInstruction::CSAAnyActive ||
+         getOpcode() == VPInstruction::CSAAnyActiveEVL;
 }
 
 bool VPInstruction::isSingleScalar() const {
-  return getOpcode() == VPInstruction::ResumePhi;
+  return getOpcode() == VPInstruction::ResumePhi ||
+         getOpcode() == VPInstruction::CSAVLPhi ||
+         getOpcode() == VPInstruction::CSAVLSel ||
+         getOpcode() == VPInstruction::ExplicitVectorLength;
 }
 
 #if !defined(NDEBUG)
@@ -834,6 +931,30 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
   case VPInstruction::PtrAdd:
     O << "ptradd";
     break;
+  case VPInstruction::CSAInitMask:
+    O << "csa-init-mask";
+    break;
+  case VPInstruction::CSAInitData:
+    O << "csa-init-data";
+    break;
+  case VPInstruction::CSAMaskPhi:
+    O << "csa-mask-phi";
+    break;
+  case VPInstruction::CSAMaskSel:
+    O << "csa-mask-sel";
+    break;
+  case VPInstruction::CSAVLPhi:
+    O << "csa-vl-phi";
+    break;
+  case VPInstruction::CSAVLSel:
+    O << "csa-vl-sel";
+    break;
+  case VPInstruction::CSAAnyActive:
+    O << "csa-anyactive";
+    break;
+  case VPInstruction::CSAAnyActiveEVL:
+    O << "csa-anyactive-evl";
+    break;
   default:
     O << Instruction::getOpcodeName(getOpcode());
   }
@@ -2141,6 +2262,123 @@ void VPScalarCastRecipe ::print(raw_ostream &O, const Twine &Indent,
 }
 #endif
 
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPCSAHeaderPHIRecipe::print(raw_ostream &O, const Twine &Indent,
+                                 VPSlotTracker &SlotTracker) const {
+  O << Indent << "EMIT ";
+  printAsOperand(O, SlotTracker);
+  O << " = csa-data-phi ";
+  printOperands(O, SlotTracker);
+}
+#endif
+
+void VPCSAHeaderPHIRecipe::execute(VPTransformState &State) {
+  // PrevBB is this BB
+  IRBuilder<>::InsertPointGuard Guard(State.Builder);
+  State.Builder.SetInsertPoint(State.CFG.PrevBB->getFirstNonPHI());
+
+  Value *InitData = State.get(getVPInitData(), 0);
+  PHINode *DataPhi =
+      State.Builder.CreatePHI(InitData->getType(), 2, "csa.data.phi");
+  BasicBlock *PreheaderBB = State.CFG.getPreheaderBBFor(this);
+  DataPhi->addIncoming(InitData, PreheaderBB);
+  // Note: We didn't add Incoming for the new data since VPCSADataUpdateRecipe
+  // may not have been executed. We let VPCSADataUpdateRecipe::execute add the
+  // incoming operand to DataPhi.
+
+  // Use the same DataPhi for all Parts
+  for (unsigned Part = 0; Part < State.UF; ++Part)
+    State.set(this, DataPhi, Part);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPCSADataUpdateRecipe::print(raw_ostream &O, const Twine &Indent,
+                                  VPSlotTracker &SlotTracker) const {
+  O << Indent << "EMIT ";
+  printAsOperand(O, SlotTracker);
+  O << " = csa-data-update ";
+  printOperands(O, SlotTracker);
+}
+#endif
+
+void VPCSADataUpdateRecipe::execute(VPTransformState &State) {
+  for (unsigned Part = 0; Part < State.UF; ++Part) {
+    Value *AnyActive = State.get(getVPAnyActive(), Part, /*NeedsScalar=*/true);
+    Value *DataUpdate = getVPDataPhi() == getVPTrue()
+                            ? State.get(getVPFalse(), Part)
+                            : State.get(getVPTrue(), Part);
+    PHINode *DataPhi = cast<PHINode>(State.get(getVPDataPhi(), Part));
+    // If not the first Part, use the mask from the previous unrolled Part
+    Value *OldData = Part == 0 ? DataPhi : State.get(this, Part - 1);
+    Value *DataSel = State.Builder.CreateSelect(AnyActive, DataUpdate, OldData,
+                                                "csa.data.sel");
+
+    if (Part == State.UF - 1)
+      DataPhi->addIncoming(DataSel, State.CFG.PrevBB);
+    State.set(this, DataSel, Part);
+  }
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPCSAExtractScalarRecipe::print(raw_ostream &O, const Twine &Indent,
+                                     VPSlotTracker &SlotTracker) const {
+  O << Indent << "EMIT ";
+  printAsOperand(O, SlotTracker);
+  O << " = CSA-EXTRACT-SCALAR ";
+  printOperands(O, SlotTracker);
+}
+#endif
+
+void VPCSAExtractScalarRecipe::execute(VPTransformState &State) {
+  IRBuilder<>::InsertPointGuard Guard(State.Builder);
+  State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI());
+
+  unsigned LastPart = State.UF - 1;
+  Value *InitScalar = getVPInitScalar()->getLiveInIRValue();
+  Value *MaskSel = State.get(getVPMaskSel(), LastPart);
+  Value *DataSel = State.get(getVPDataSel(), LastPart);
+
+  Value *LastIdx = nullptr;
+  Value *IndexVec = State.Builder.CreateStepVector(
+      VectorType::get(State.Builder.getInt32Ty(), State.VF), "csa.step");
+  Value *NegOne = ConstantInt::get(IndexVec->getType()->getScalarType(), -1);
+  if (usesEVL()) {
+    // A vp.reduce.smax over the IndexVec with the MaskSel as the mask will
+    // give us the last active index into MaskSel, which gives us the correct
+    // index in the data vector to extract from. If no element in the mask
+    // is active, we pick -1. If we pick -1, then we will use the initial scalar
+    // value instead of extracting from the data vector.
+    Value *VL = State.get(getVPCSAVLSel(), LastPart, /*NeedsScalar=*/true);
+    LastIdx = State.Builder.CreateIntrinsic(NegOne->getType(),
+                                            Intrinsic::vp_reduce_smax,
+                                            {NegOne, IndexVec, MaskSel, VL});
+  } else {
+    // Get a vector where the elements are zero when the last active mask is
+    // false and the index in the vector when the mask is true.
+    Value *ActiveLaneIdxs = State.Builder.CreateSelect(
+        MaskSel, IndexVec, ConstantAggregateZero::get(IndexVec->getType()));
+    // Get the last active index in the mask. When no lanes in the mask are
+    // active, vector.umax will have value 0. Take the additional step to set
+    // LastIdx as -1 in this case to avoid the case of lane 0 of the mask being
+    // inactive, which would also cause the reduction to have value 0.
+    Value *MaybeLastIdx = State.Builder.CreateIntMaxReduce(ActiveLaneIdxs);
+    Value *IsLaneZeroActive =
+        State.Builder.CreateExtractElement(MaskSel, (uint64_t)0);
+    Value *Zero = ConstantInt::get(MaybeLastIdx->getType(), 0);
+    Value *MaybeLastIdxEQZero = State.Builder.CreateICmpEQ(MaybeLastIdx, Zero);
+    Value *And = State.Builder.CreateAnd(IsLaneZeroActive, MaybeLastIdxEQZero);
+    LastIdx = State.Builder.CreateSelect(And, Zero, NegOne);
+  }
+
+  Value *ExtractFromVec =
+      State.Builder.CreateExtractElement(DataSel, LastIdx, "csa.extract");
+  Value *Zero = ConstantInt::get(LastIdx->getType(), 0);
+  Value *LastIdxGEZero = State.Builder.CreateICmpSGE(LastIdx, Zero);
+  Value *ChooseFromVecOrInit =
+      State.Builder.CreateSelect(LastIdxGEZero, ExtractFromVec, InitScalar);
+  State.set(this, ChooseFromVecOrInit, 0, /*IsScalar=*/true);
+}
+
 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
   assert(State.Lane && "Branch on Mask works only on single instance.");
 
@@ -3237,3 +3475,30 @@ void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
   printOperands(O, SlotTracker);
 }
 #endif
+
+bool vputils::isPhi(const VPRecipeBase &R) {
+  if (R.isPhi())
+    return true;
+  if (auto *VPInst = dyn_cast<VPInstruction>(&R))
+    return VPInst->getOpcode() == VPInstruction::CSAMaskPhi ||
+           VPInst->getOpcode() == VPInstruction::CSAVLPhi;
+  return false;
+}
+
+bool vputils::isPhiThatGeneratesBackedge(const VPRecipeBase &R) {
+  if (isa<VPWidenPHIRecipe, VPCSAHeaderPHIRecipe>(&R))
+    return true;
+  if (auto *VPInst = dyn_cast<VPInstruction>(&R))
+    return VPInst->getOpcode() == VPInstruction::CSAMaskPhi ||
+           VPInst->getOpcode() == VPInstruction::CSAVLPhi;
+  return false;
+}
+
+bool vputils::isHeaderPhi(const VPRecipeBase &R) {
+  if (isa<VPHeaderPHIRecipe, VPWidenPHIRecipe>(&R))
+    return true;
+  if (auto *VPInst = dyn_cast<VPInstruction>(&R))
+    return VPInst->getOpcode() == VPInstruction::CSAMaskPhi ||
+           VPInst->getOpcode() == VPInstruction::CSAVLPhi;
+  return false;
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index a878613c4ba483..0788b1faaeb0dd 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1349,6 +1349,45 @@ void VPlanTransforms::addActiveLaneMask(
     HeaderMask->replaceAllUsesWith(LaneMask);
 }
 
+/// Add recipes required to make CSA work with EVL based approach. This
+/// includes replacing \p CSAAnyActive with \p CSAAnyActiveEVL, and adding \p
+/// CSAVLPhi and \p CSAVLSel instructions.
+static void addExplicitVectorLengthForCSA(
+    VPValue &EVL, const MapVector<PHINode *, VPCSAState *> &CSAStates) {
+  for (auto &[_, CSAState] : CSAStates) {
+    // CSAAnyActive is used to keep track of whether any condition on the
+    // current iteration is active. This is used to decide whether the mask
+    // should be updated. When we are using EVL, we must only consider the first
+    // EVL number of elements in the mask. Replace CSAAnyActive with the EVL
+    // specific CSAAnyActiveEVL instruction.
+    auto *VPAnyActive = CSAState->getVPAnyActive();
+    auto *VPAnyActiveEVL = new VPInstruction(
+        VPInstruction::CSAAnyActiveEVL, {VPAnyActive->getOperand(0), &EVL},
+        VPAnyActive->getDebugLoc(), "csa.cond.anyactive");
+    VPAnyActiveEVL->insertBefore(VPAnyActive);
+    VPAnyActive->replaceAllUsesWith(VPAnyActiveEVL->getVPSingleValue());
+    VPAnyActive->eraseFromParent();
+    CSAState->setVPAnyActive(VPAnyActiveEVL);
+
+    // When we are using EVL, we must keep track of the most recent EVL when at
+    // least one lane in the mask was active. Imagine the scenario: on iteration
+    // N, there was at least one active lane in the mask. Then on all future
+    // iteration there was no active lanes in the mask. When it is time to
+    // extract the scalar from the data vector, we must use the EVL that
+    // corresponds to the EVL that was used when the mask vector was last
+    // updated. To do this, we introduce CSAVLPhi and CSAVLSel instructions
+    auto *VPVLPhi =
+        new VPInstruction(VPInstruction::CSAVLPhi, {}, {}, "csa.vl.phi");
+    auto *VPVLSel =
+        new VPInstruction(VPInstruction::CSAVLSel,
+                          {VPAnyActiveEVL, VPVLPhi, &EVL}, {}, "csa.vl.sel");
+    VPVLPhi->insertAfter(CSAState->getPhiRecipe());
+    VPVLSel->insertAfter(VPAnyActiveEVL);
+
+    CSAState->getExtractScalarRecipe()->addOperand(VPVLSel);
+  }
+}
+
 /// Replace recipes with their EVL variants.
 static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
   SmallVector<VPValue *> HeaderMasks = collectAllHeaderMasks(Plan);
@@ -1404,6 +1443,11 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
     }
     recursivelyDeleteDeadRecipes(HeaderMask);
   }
+
+  // We build the scalar version of a CSA when VF=ElementCount::getFixed(1),
+  // which does not require an EVL.
+  if (!Plan.hasScalarVFOnly())
+    addExplicitVectorLengthForCSA(EVL, Plan.getCSAStates());
 }
 
 /// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 4c383244f96f1a..0e3922b672ed68 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -359,6 +359,8 @@ class VPDef {
     VPWidenSelectSC,
     VPBlendSC,
     VPHistogramSC,
+    VPCSADataUpdateSC,
+    VPCSAExtractScalarSC,
     // START: Phi-like recipes. Need to be kept together.
     VPWidenPHISC,
     VPPredInstPHISC,
@@ -370,6 +372,7 @@ class VPDef {
     VPFirstOrderRecurrencePHISC,
     VPWidenIntOrFpInductionSC,
     VPWidenPointerInductionSC,
+    VPCSAHeaderPHISC,
     VPReductionPHISC,
     // END: SubclassID for recipes that inherit VPHeaderPHIRecipe
     // END: Phi-like recipes
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 99bc4c38a3c3cd..b4bc3de463de11 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -73,11 +73,11 @@ bool VPlanVerifier::verifyPhiRecipes(const VPBasicBlock *VPBB) {
   const VPRegionBlock *ParentR = VPBB->getParent();
   bool IsHeaderVPBB = ParentR && !ParentR->isReplicator() &&
                       ParentR->getEntryBasicBlock() == VPBB;
-  while (RecipeI != End && RecipeI->isPhi()) {
+  while (RecipeI != End && vputils::isPhi(*RecipeI)) {
     if (isa<VPActiveLaneMaskPHIRecipe>(RecipeI))
       NumActiveLaneMaskPhiRecipes++;
 
-    if (IsHeaderVPBB && !isa<VPHeaderPHIRecipe, VPWidenPHIRecipe>(*RecipeI)) {
+    if (IsHeaderVPBB && !vputils::isHeaderPhi(*RecipeI)) {
       errs() << "Found non-header PHI recipe in header VPBB";
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
       errs() << ": ";
@@ -104,7 +104,7 @@ bool VPlanVerifier::verifyPhiRecipes(const VPBasicBlock *VPBB) {
   }
 
   while (RecipeI != End) {
-    if (RecipeI->isPhi() && !isa<VPBlendRecipe>(&*RecipeI)) {
+    if (vputils::isPhi(*RecipeI) && !isa<VPBlendRecipe>(&*RecipeI)) {
       errs() << "Found phi-like recipe after non-phi recipe";
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/csa.ll b/llvm/test/Transforms/LoopVectorize/RISCV/csa.ll
index 71a5519522a275..89a2dbadd3a2ff 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/csa.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/csa.ll
@@ -1,13 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -S -mtriple riscv64 -mattr="+v" -riscv-v-vector-bits-min=256 \
 ; RUN:   -passes=loop-vectorize -force-tail-folding-style=data-with-evl \
-; RUN:   | FileCheck %s -check-prefix=EVL
+; RUN:   -enable-csa-vectorization | FileCheck %s -check-prefix=EVL
 ; RUN: opt < %s -S -mtriple riscv64 -mattr="+v" -riscv-v-vector-bits-min=256 \
 ; RUN:   -passes=loop-vectorize -force-tail-folding-style=none \
-; RUN:   | FileCheck %s -check-prefix=NO-EVL
+; RUN:   -enable-csa-vectorization | FileCheck %s -check-prefix=NO-EVL
 ; RUN: opt < %s -S -mtriple riscv64 -mattr="+v" -riscv-v-vector-bits-min=256 \
 ; RUN:   -passes=loop-vectorize -force-tail-folding-style=data \
-; RUN:   | FileCheck %s -check-prefix=DATA
+; RUN:   -enable-csa-vectorization | FileCheck %s -check-prefix=DATA
 
 ; This function is generated from the following C/C++ program:
 ; int simple_csa_int_select(int N, int *data, int a) {
@@ -25,24 +25,69 @@ define i32 @simple_csa_int_select(i32 %N, ptr %data, i64 %a) {
 ; EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; EVL:       for.body.preheader:
 ; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; EVL:       vector.ph:
+; EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[A:%.*]], i64 0
+; EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; EVL:       vector.body:
+; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP6]]
+; EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; EVL-NEXT:    [[TMP9:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
+; EVL-NEXT:    [[TMP10:%.*]] = icmp slt <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP9]]
+; EVL-NEXT:    [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP10]])
+; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP11]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP11]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
+; EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; EVL-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; EVL:       middle.block:
+; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; EVL-NEXT:    [[TMP13:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; EVL-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP13]])
+; EVL-NEXT:    [[TMP15:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[TMP14]], 0
+; EVL-NEXT:    [[TMP17:%.*]] = and i1 [[TMP15]], [[TMP16]]
+; EVL-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i32 0, i32 -1
+; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP18]]
+; EVL-NEXT:    [[TMP19:%.*]] = icmp sge i32 [[TMP18]], 0
+; EVL-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[CSA_EXTRACT]], i32 -1
+; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL:       scalar.ph:
+; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; EVL-NEXT:    br label [[FOR_BODY:%.*]]
 ; EVL:       for.cond.cleanup.loopexit:
-; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
 ; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; EVL:       for.cond.cleanup:
-; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
 ; EVL-NEXT:    ret i32 [[T_0_LCSSA]]
 ; EVL:       for.body:
-; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT:    [[TMP1:%.*]] = sext i32 [[TMP0]] to i64
-; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A:%.*]], [[TMP1]]
-; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP0]], i32 [[T_010]]
+; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT:    [[TMP22:%.*]] = sext i32 [[TMP21]] to i64
+; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP22]]
+; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP21]], i32 [[T_010]]
 ; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ;
 ; NO-EVL-LABEL: @simple_csa_int_select(
 ; NO-EVL-NEXT:  entry:
@@ -50,24 +95,69 @@ define i32 @simple_csa_int_select(i32 %N, ptr %data, i64 %a) {
 ; NO-EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; NO-EVL:       for.body.preheader:
 ; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-EVL:       vector.ph:
+; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; NO-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[A:%.*]], i64 0
+; NO-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NO-EVL:       vector.body:
+; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP6]]
+; NO-EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; NO-EVL-NEXT:    [[TMP9:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
+; NO-EVL-NEXT:    [[TMP10:%.*]] = icmp slt <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP9]]
+; NO-EVL-NEXT:    [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP10]])
+; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP11]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP11]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-EVL-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-EVL:       middle.block:
+; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; NO-EVL-NEXT:    [[TMP13:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; NO-EVL-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP13]])
+; NO-EVL-NEXT:    [[TMP15:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[TMP14]], 0
+; NO-EVL-NEXT:    [[TMP17:%.*]] = and i1 [[TMP15]], [[TMP16]]
+; NO-EVL-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i32 0, i32 -1
+; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP18]]
+; NO-EVL-NEXT:    [[TMP19:%.*]] = icmp sge i32 [[TMP18]], 0
+; NO-EVL-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[CSA_EXTRACT]], i32 -1
+; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL:       scalar.ph:
+; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
 ; NO-EVL:       for.cond.cleanup.loopexit:
-; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
 ; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; NO-EVL:       for.cond.cleanup:
-; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
 ; NO-EVL-NEXT:    ret i32 [[T_0_LCSSA]]
 ; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; NO-EVL-NEXT:    [[TMP1:%.*]] = sext i32 [[TMP0]] to i64
-; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A:%.*]], [[TMP1]]
-; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP0]], i32 [[T_010]]
+; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT:    [[TMP22:%.*]] = sext i32 [[TMP21]] to i64
+; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP22]]
+; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP21]], i32 [[T_010]]
 ; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ;
 ; DATA-LABEL: @simple_csa_int_select(
 ; DATA-NEXT:  entry:
@@ -75,24 +165,69 @@ define i32 @simple_csa_int_select(i32 %N, ptr %data, i64 %a) {
 ; DATA-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; DATA:       for.body.preheader:
 ; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; DATA-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; DATA-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DATA:       vector.ph:
+; DATA-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; DATA-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; DATA-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; DATA-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; DATA-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[A:%.*]], i64 0
+; DATA-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; DATA-NEXT:    br label [[VECTOR_BODY:%.*]]
+; DATA:       vector.body:
+; DATA-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; DATA-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP6]]
+; DATA-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; DATA-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; DATA-NEXT:    [[TMP9:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
+; DATA-NEXT:    [[TMP10:%.*]] = icmp slt <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP9]]
+; DATA-NEXT:    [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP10]])
+; DATA-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP11]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; DATA-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP11]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
+; DATA-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; DATA-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DATA-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; DATA:       middle.block:
+; DATA-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; DATA-NEXT:    [[TMP13:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; DATA-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP13]])
+; DATA-NEXT:    [[TMP15:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; DATA-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[TMP14]], 0
+; DATA-NEXT:    [[TMP17:%.*]] = and i1 [[TMP15]], [[TMP16]]
+; DATA-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i32 0, i32 -1
+; DATA-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP18]]
+; DATA-NEXT:    [[TMP19:%.*]] = icmp sge i32 [[TMP18]], 0
+; DATA-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[CSA_EXTRACT]], i32 -1
+; DATA-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; DATA-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DATA:       scalar.ph:
+; DATA-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; DATA-NEXT:    br label [[FOR_BODY:%.*]]
 ; DATA:       for.cond.cleanup.loopexit:
-; DATA-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
 ; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; DATA:       for.cond.cleanup:
-; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
 ; DATA-NEXT:    ret i32 [[T_0_LCSSA]]
 ; DATA:       for.body:
-; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; DATA-NEXT:    [[TMP1:%.*]] = sext i32 [[TMP0]] to i64
-; DATA-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A:%.*]], [[TMP1]]
-; DATA-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP0]], i32 [[T_010]]
+; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; DATA-NEXT:    [[TMP22:%.*]] = sext i32 [[TMP21]] to i64
+; DATA-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP22]]
+; DATA-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP21]], i32 [[T_010]]
 ; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ;
 entry:
   %cmp9 = icmp sgt i32 %N, 0
@@ -139,24 +274,78 @@ define i32 @simple_csa_int_select_induction_cmp(i32 %N, ptr %data) {
 ; EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; EVL:       for.body.preheader:
 ; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; EVL:       vector.ph:
+; EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; EVL-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; EVL-NEXT:    [[TMP7:%.*]] = add <vscale x 4 x i64> [[TMP6]], zeroinitializer
+; EVL-NEXT:    [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP7]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
+; EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; EVL-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP10]]
+; EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP11]], i64 0
+; EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; EVL:       vector.body:
+; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP12]]
+; EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
+; EVL-NEXT:    [[TMP15:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
+; EVL-NEXT:    [[TMP16:%.*]] = icmp slt <vscale x 4 x i64> [[VEC_IND]], [[TMP15]]
+; EVL-NEXT:    [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP16]])
+; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
+; EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; EVL-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; EVL:       middle.block:
+; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; EVL-NEXT:    [[TMP19:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; EVL-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP19]])
+; EVL-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT:    [[TMP22:%.*]] = icmp eq i32 [[TMP20]], 0
+; EVL-NEXT:    [[TMP23:%.*]] = and i1 [[TMP21]], [[TMP22]]
+; EVL-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i32 0, i32 -1
+; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP24]]
+; EVL-NEXT:    [[TMP25:%.*]] = icmp sge i32 [[TMP24]], 0
+; EVL-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[CSA_EXTRACT]], i32 -1
+; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL:       scalar.ph:
+; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; EVL-NEXT:    br label [[FOR_BODY:%.*]]
 ; EVL:       for.cond.cleanup.loopexit:
-; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
 ; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; EVL:       for.cond.cleanup:
-; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
 ; EVL-NEXT:    ret i32 [[T_0_LCSSA]]
 ; EVL:       for.body:
-; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT:    [[TMP1:%.*]] = sext i32 [[TMP0]] to i64
-; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP1]]
-; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP0]], i32 [[T_010]]
+; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT:    [[TMP28:%.*]] = sext i32 [[TMP27]] to i64
+; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP28]]
+; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP27]], i32 [[T_010]]
 ; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ;
 ; NO-EVL-LABEL: @simple_csa_int_select_induction_cmp(
 ; NO-EVL-NEXT:  entry:
@@ -164,24 +353,78 @@ define i32 @simple_csa_int_select_induction_cmp(i32 %N, ptr %data) {
 ; NO-EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; NO-EVL:       for.body.preheader:
 ; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-EVL:       vector.ph:
+; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; NO-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-EVL-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; NO-EVL-NEXT:    [[TMP7:%.*]] = add <vscale x 4 x i64> [[TMP6]], zeroinitializer
+; NO-EVL-NEXT:    [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP7]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; NO-EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
+; NO-EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; NO-EVL-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP10]]
+; NO-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP11]], i64 0
+; NO-EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NO-EVL:       vector.body:
+; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP12]]
+; NO-EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
+; NO-EVL-NEXT:    [[TMP15:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
+; NO-EVL-NEXT:    [[TMP16:%.*]] = icmp slt <vscale x 4 x i64> [[VEC_IND]], [[TMP15]]
+; NO-EVL-NEXT:    [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP16]])
+; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; NO-EVL-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; NO-EVL:       middle.block:
+; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; NO-EVL-NEXT:    [[TMP19:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; NO-EVL-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP19]])
+; NO-EVL-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT:    [[TMP22:%.*]] = icmp eq i32 [[TMP20]], 0
+; NO-EVL-NEXT:    [[TMP23:%.*]] = and i1 [[TMP21]], [[TMP22]]
+; NO-EVL-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i32 0, i32 -1
+; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP24]]
+; NO-EVL-NEXT:    [[TMP25:%.*]] = icmp sge i32 [[TMP24]], 0
+; NO-EVL-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[CSA_EXTRACT]], i32 -1
+; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL:       scalar.ph:
+; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
 ; NO-EVL:       for.cond.cleanup.loopexit:
-; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
 ; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; NO-EVL:       for.cond.cleanup:
-; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
 ; NO-EVL-NEXT:    ret i32 [[T_0_LCSSA]]
 ; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; NO-EVL-NEXT:    [[TMP1:%.*]] = sext i32 [[TMP0]] to i64
-; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP1]]
-; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP0]], i32 [[T_010]]
+; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT:    [[TMP28:%.*]] = sext i32 [[TMP27]] to i64
+; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP28]]
+; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP27]], i32 [[T_010]]
 ; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ;
 ; DATA-LABEL: @simple_csa_int_select_induction_cmp(
 ; DATA-NEXT:  entry:
@@ -189,24 +432,78 @@ define i32 @simple_csa_int_select_induction_cmp(i32 %N, ptr %data) {
 ; DATA-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; DATA:       for.body.preheader:
 ; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; DATA-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; DATA-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DATA:       vector.ph:
+; DATA-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; DATA-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; DATA-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; DATA-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; DATA-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; DATA-NEXT:    [[TMP7:%.*]] = add <vscale x 4 x i64> [[TMP6]], zeroinitializer
+; DATA-NEXT:    [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP7]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; DATA-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
+; DATA-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; DATA-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP10]]
+; DATA-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP11]], i64 0
+; DATA-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; DATA-NEXT:    br label [[VECTOR_BODY:%.*]]
+; DATA:       vector.body:
+; DATA-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; DATA-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP12]]
+; DATA-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; DATA-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
+; DATA-NEXT:    [[TMP15:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
+; DATA-NEXT:    [[TMP16:%.*]] = icmp slt <vscale x 4 x i64> [[VEC_IND]], [[TMP15]]
+; DATA-NEXT:    [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP16]])
+; DATA-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; DATA-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
+; DATA-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; DATA-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; DATA-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DATA-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; DATA:       middle.block:
+; DATA-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; DATA-NEXT:    [[TMP19:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; DATA-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP19]])
+; DATA-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; DATA-NEXT:    [[TMP22:%.*]] = icmp eq i32 [[TMP20]], 0
+; DATA-NEXT:    [[TMP23:%.*]] = and i1 [[TMP21]], [[TMP22]]
+; DATA-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i32 0, i32 -1
+; DATA-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP24]]
+; DATA-NEXT:    [[TMP25:%.*]] = icmp sge i32 [[TMP24]], 0
+; DATA-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[CSA_EXTRACT]], i32 -1
+; DATA-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; DATA-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DATA:       scalar.ph:
+; DATA-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; DATA-NEXT:    br label [[FOR_BODY:%.*]]
 ; DATA:       for.cond.cleanup.loopexit:
-; DATA-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
 ; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; DATA:       for.cond.cleanup:
-; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
 ; DATA-NEXT:    ret i32 [[T_0_LCSSA]]
 ; DATA:       for.body:
-; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; DATA-NEXT:    [[TMP1:%.*]] = sext i32 [[TMP0]] to i64
-; DATA-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP1]]
-; DATA-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP0]], i32 [[T_010]]
+; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; DATA-NEXT:    [[TMP28:%.*]] = sext i32 [[TMP27]] to i64
+; DATA-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP28]]
+; DATA-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP27]], i32 [[T_010]]
 ; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ;
 entry:
   %cmp9 = icmp sgt i32 %N, 0
@@ -253,23 +550,65 @@ define float @simple_csa_float_select(i32 %N, ptr %data) {
 ; EVL-NEXT:    br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; EVL:       for.body.preheader:
 ; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; EVL:       vector.ph:
+; EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; EVL:       vector.body:
+; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[TMP6]]
+; EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
+; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
+; EVL-NEXT:    [[TMP9:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD]], zeroinitializer
+; EVL-NEXT:    [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP9]])
+; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP10]], <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP10]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[CSA_DATA_PHI]]
+; EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; EVL-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; EVL:       middle.block:
+; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; EVL-NEXT:    [[TMP12:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; EVL-NEXT:    [[TMP13:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP12]])
+; EVL-NEXT:    [[TMP14:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[TMP13]], 0
+; EVL-NEXT:    [[TMP16:%.*]] = and i1 [[TMP14]], [[TMP15]]
+; EVL-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 0, i32 -1
+; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x float> [[CSA_DATA_SEL]], i32 [[TMP17]]
+; EVL-NEXT:    [[TMP18:%.*]] = icmp sge i32 [[TMP17]], 0
+; EVL-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], float [[CSA_EXTRACT]], float 1.000000e+00
+; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL:       scalar.ph:
+; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; EVL-NEXT:    br label [[FOR_BODY:%.*]]
 ; EVL:       for.cond.cleanup.loopexit:
-; EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
 ; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; EVL:       for.cond.cleanup:
-; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
 ; EVL-NEXT:    ret float [[T_0_LCSSA]]
 ; EVL:       for.body:
-; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[T_09:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 0.000000e+00
-; EVL-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP0]], float [[T_09]]
+; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[T_09:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP20:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP20]], 0.000000e+00
+; EVL-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP20]], float [[T_09]]
 ; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ;
 ; NO-EVL-LABEL: @simple_csa_float_select(
 ; NO-EVL-NEXT:  entry:
@@ -277,23 +616,65 @@ define float @simple_csa_float_select(i32 %N, ptr %data) {
 ; NO-EVL-NEXT:    br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; NO-EVL:       for.body.preheader:
 ; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-EVL:       vector.ph:
+; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; NO-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NO-EVL:       vector.body:
+; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[TMP6]]
+; NO-EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
+; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
+; NO-EVL-NEXT:    [[TMP9:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD]], zeroinitializer
+; NO-EVL-NEXT:    [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP9]])
+; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP10]], <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP10]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-EVL-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; NO-EVL:       middle.block:
+; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; NO-EVL-NEXT:    [[TMP12:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; NO-EVL-NEXT:    [[TMP13:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP12]])
+; NO-EVL-NEXT:    [[TMP14:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[TMP13]], 0
+; NO-EVL-NEXT:    [[TMP16:%.*]] = and i1 [[TMP14]], [[TMP15]]
+; NO-EVL-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 0, i32 -1
+; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x float> [[CSA_DATA_SEL]], i32 [[TMP17]]
+; NO-EVL-NEXT:    [[TMP18:%.*]] = icmp sge i32 [[TMP17]], 0
+; NO-EVL-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], float [[CSA_EXTRACT]], float 1.000000e+00
+; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL:       scalar.ph:
+; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
 ; NO-EVL:       for.cond.cleanup.loopexit:
-; NO-EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
 ; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; NO-EVL:       for.cond.cleanup:
-; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
 ; NO-EVL-NEXT:    ret float [[T_0_LCSSA]]
 ; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[T_09:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; NO-EVL-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 0.000000e+00
-; NO-EVL-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP0]], float [[T_09]]
+; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[T_09:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP20:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP20]], 0.000000e+00
+; NO-EVL-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP20]], float [[T_09]]
 ; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ;
 ; DATA-LABEL: @simple_csa_float_select(
 ; DATA-NEXT:  entry:
@@ -301,23 +682,65 @@ define float @simple_csa_float_select(i32 %N, ptr %data) {
 ; DATA-NEXT:    br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; DATA:       for.body.preheader:
 ; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; DATA-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; DATA-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DATA:       vector.ph:
+; DATA-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; DATA-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; DATA-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; DATA-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; DATA-NEXT:    br label [[VECTOR_BODY:%.*]]
+; DATA:       vector.body:
+; DATA-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; DATA-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[TMP6]]
+; DATA-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
+; DATA-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
+; DATA-NEXT:    [[TMP9:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD]], zeroinitializer
+; DATA-NEXT:    [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP9]])
+; DATA-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP10]], <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; DATA-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP10]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[CSA_DATA_PHI]]
+; DATA-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; DATA-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DATA-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; DATA:       middle.block:
+; DATA-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; DATA-NEXT:    [[TMP12:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; DATA-NEXT:    [[TMP13:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP12]])
+; DATA-NEXT:    [[TMP14:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; DATA-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[TMP13]], 0
+; DATA-NEXT:    [[TMP16:%.*]] = and i1 [[TMP14]], [[TMP15]]
+; DATA-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 0, i32 -1
+; DATA-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x float> [[CSA_DATA_SEL]], i32 [[TMP17]]
+; DATA-NEXT:    [[TMP18:%.*]] = icmp sge i32 [[TMP17]], 0
+; DATA-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], float [[CSA_EXTRACT]], float 1.000000e+00
+; DATA-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; DATA-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DATA:       scalar.ph:
+; DATA-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; DATA-NEXT:    br label [[FOR_BODY:%.*]]
 ; DATA:       for.cond.cleanup.loopexit:
-; DATA-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
 ; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; DATA:       for.cond.cleanup:
-; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
 ; DATA-NEXT:    ret float [[T_0_LCSSA]]
 ; DATA:       for.body:
-; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[T_09:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; DATA-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 0.000000e+00
-; DATA-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP0]], float [[T_09]]
+; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[T_09:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP20:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; DATA-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP20]], 0.000000e+00
+; DATA-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP20]], float [[T_09]]
 ; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ;
 entry:
   %cmp8 = icmp sgt i32 %N, 0
@@ -626,32 +1049,97 @@ define i32 @csa_in_series_int_select(i32 %N, ptr %data0, ptr %data1, i64 %a) {
 ; EVL-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; EVL:       for.body.preheader:
 ; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; EVL:       vector.ph:
+; EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[A:%.*]], i64 0
+; EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; EVL:       vector.body:
+; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP6]]
+; EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; EVL-NEXT:    [[TMP9:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
+; EVL-NEXT:    [[TMP10:%.*]] = icmp slt <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP9]]
+; EVL-NEXT:    [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP10]])
+; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP11]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> [[CSA_MASK_PHI1]]
+; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP11]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI2]]
+; EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP6]]
+; EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
+; EVL-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i32>, ptr [[TMP13]], align 4
+; EVL-NEXT:    [[TMP14:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD3]] to <vscale x 4 x i64>
+; EVL-NEXT:    [[TMP15:%.*]] = icmp slt <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP14]]
+; EVL-NEXT:    [[TMP16:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP15]])
+; EVL-NEXT:    [[CSA_MASK_SEL4]] = select i1 [[TMP16]], <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT:    [[CSA_DATA_SEL5]] = select i1 [[TMP16]], <vscale x 4 x i32> [[WIDE_LOAD3]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
+; EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; EVL-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; EVL:       middle.block:
+; EVL-NEXT:    [[CSA_STEP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; EVL-NEXT:    [[TMP18:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL4]], <vscale x 4 x i32> [[CSA_STEP6]], <vscale x 4 x i32> zeroinitializer
+; EVL-NEXT:    [[TMP19:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP18]])
+; EVL-NEXT:    [[TMP20:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL4]], i64 0
+; EVL-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP19]], 0
+; EVL-NEXT:    [[TMP22:%.*]] = and i1 [[TMP20]], [[TMP21]]
+; EVL-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], i32 0, i32 -1
+; EVL-NEXT:    [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL5]], i32 [[TMP23]]
+; EVL-NEXT:    [[TMP24:%.*]] = icmp sge i32 [[TMP23]], 0
+; EVL-NEXT:    [[TMP25:%.*]] = select i1 [[TMP24]], i32 [[CSA_EXTRACT7]], i32 -1
+; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; EVL-NEXT:    [[TMP26:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; EVL-NEXT:    [[TMP27:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP26]])
+; EVL-NEXT:    [[TMP28:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT:    [[TMP29:%.*]] = icmp eq i32 [[TMP27]], 0
+; EVL-NEXT:    [[TMP30:%.*]] = and i1 [[TMP28]], [[TMP29]]
+; EVL-NEXT:    [[TMP31:%.*]] = select i1 [[TMP30]], i32 0, i32 -1
+; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP31]]
+; EVL-NEXT:    [[TMP32:%.*]] = icmp sge i32 [[TMP31]], 0
+; EVL-NEXT:    [[TMP33:%.*]] = select i1 [[TMP32]], i32 [[CSA_EXTRACT]], i32 -1
+; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL:       scalar.ph:
+; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; EVL-NEXT:    br label [[FOR_BODY:%.*]]
 ; EVL:       for.cond.cleanup.loopexit:
-; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
+; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT:    [[TMP34:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
 ; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; EVL:       for.cond.cleanup:
-; EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
+; EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP34]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
 ; EVL-NEXT:    ret i32 [[OR]]
 ; EVL:       for.body:
-; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
-; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A:%.*]], [[TMP2]]
-; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP1]], i32 [[T_022]]
-; EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; EVL-NEXT:    [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
-; EVL-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[A]], [[TMP4]]
-; EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP3]], i32 [[S_023]]
+; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP35:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT:    [[TMP36:%.*]] = sext i32 [[TMP35]] to i64
+; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP36]]
+; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP35]], i32 [[T_022]]
+; EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP37:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; EVL-NEXT:    [[TMP38:%.*]] = sext i32 [[TMP37]] to i64
+; EVL-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[A]], [[TMP38]]
+; EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP37]], i32 [[S_023]]
 ; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ;
 ; NO-EVL-LABEL: @csa_in_series_int_select(
 ; NO-EVL-NEXT:  entry:
@@ -659,32 +1147,97 @@ define i32 @csa_in_series_int_select(i32 %N, ptr %data0, ptr %data1, i64 %a) {
 ; NO-EVL-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; NO-EVL:       for.body.preheader:
 ; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-EVL:       vector.ph:
+; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; NO-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[A:%.*]], i64 0
+; NO-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NO-EVL:       vector.body:
+; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP6]]
+; NO-EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; NO-EVL-NEXT:    [[TMP9:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
+; NO-EVL-NEXT:    [[TMP10:%.*]] = icmp slt <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP9]]
+; NO-EVL-NEXT:    [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP10]])
+; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP11]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> [[CSA_MASK_PHI1]]
+; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP11]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI2]]
+; NO-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP6]]
+; NO-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
+; NO-EVL-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i32>, ptr [[TMP13]], align 4
+; NO-EVL-NEXT:    [[TMP14:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD3]] to <vscale x 4 x i64>
+; NO-EVL-NEXT:    [[TMP15:%.*]] = icmp slt <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP14]]
+; NO-EVL-NEXT:    [[TMP16:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP15]])
+; NO-EVL-NEXT:    [[CSA_MASK_SEL4]] = select i1 [[TMP16]], <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT:    [[CSA_DATA_SEL5]] = select i1 [[TMP16]], <vscale x 4 x i32> [[WIDE_LOAD3]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-EVL-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; NO-EVL:       middle.block:
+; NO-EVL-NEXT:    [[CSA_STEP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; NO-EVL-NEXT:    [[TMP18:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL4]], <vscale x 4 x i32> [[CSA_STEP6]], <vscale x 4 x i32> zeroinitializer
+; NO-EVL-NEXT:    [[TMP19:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP18]])
+; NO-EVL-NEXT:    [[TMP20:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL4]], i64 0
+; NO-EVL-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP19]], 0
+; NO-EVL-NEXT:    [[TMP22:%.*]] = and i1 [[TMP20]], [[TMP21]]
+; NO-EVL-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], i32 0, i32 -1
+; NO-EVL-NEXT:    [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL5]], i32 [[TMP23]]
+; NO-EVL-NEXT:    [[TMP24:%.*]] = icmp sge i32 [[TMP23]], 0
+; NO-EVL-NEXT:    [[TMP25:%.*]] = select i1 [[TMP24]], i32 [[CSA_EXTRACT7]], i32 -1
+; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; NO-EVL-NEXT:    [[TMP26:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; NO-EVL-NEXT:    [[TMP27:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP26]])
+; NO-EVL-NEXT:    [[TMP28:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT:    [[TMP29:%.*]] = icmp eq i32 [[TMP27]], 0
+; NO-EVL-NEXT:    [[TMP30:%.*]] = and i1 [[TMP28]], [[TMP29]]
+; NO-EVL-NEXT:    [[TMP31:%.*]] = select i1 [[TMP30]], i32 0, i32 -1
+; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP31]]
+; NO-EVL-NEXT:    [[TMP32:%.*]] = icmp sge i32 [[TMP31]], 0
+; NO-EVL-NEXT:    [[TMP33:%.*]] = select i1 [[TMP32]], i32 [[CSA_EXTRACT]], i32 -1
+; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL:       scalar.ph:
+; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
 ; NO-EVL:       for.cond.cleanup.loopexit:
-; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
+; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT:    [[TMP34:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
 ; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; NO-EVL:       for.cond.cleanup:
-; NO-EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
+; NO-EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP34]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
 ; NO-EVL-NEXT:    ret i32 [[OR]]
 ; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; NO-EVL-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
-; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A:%.*]], [[TMP2]]
-; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP1]], i32 [[T_022]]
-; NO-EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; NO-EVL-NEXT:    [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
-; NO-EVL-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[A]], [[TMP4]]
-; NO-EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP3]], i32 [[S_023]]
+; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP35:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT:    [[TMP36:%.*]] = sext i32 [[TMP35]] to i64
+; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP36]]
+; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP35]], i32 [[T_022]]
+; NO-EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP37:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; NO-EVL-NEXT:    [[TMP38:%.*]] = sext i32 [[TMP37]] to i64
+; NO-EVL-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[A]], [[TMP38]]
+; NO-EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP37]], i32 [[S_023]]
 ; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ;
 ; DATA-LABEL: @csa_in_series_int_select(
 ; DATA-NEXT:  entry:
@@ -692,32 +1245,97 @@ define i32 @csa_in_series_int_select(i32 %N, ptr %data0, ptr %data1, i64 %a) {
 ; DATA-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; DATA:       for.body.preheader:
 ; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; DATA-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; DATA-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DATA:       vector.ph:
+; DATA-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; DATA-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; DATA-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; DATA-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; DATA-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[A:%.*]], i64 0
+; DATA-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; DATA-NEXT:    br label [[VECTOR_BODY:%.*]]
+; DATA:       vector.body:
+; DATA-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; DATA-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP6]]
+; DATA-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; DATA-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; DATA-NEXT:    [[TMP9:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
+; DATA-NEXT:    [[TMP10:%.*]] = icmp slt <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP9]]
+; DATA-NEXT:    [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP10]])
+; DATA-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP11]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> [[CSA_MASK_PHI1]]
+; DATA-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP11]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI2]]
+; DATA-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP6]]
+; DATA-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
+; DATA-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i32>, ptr [[TMP13]], align 4
+; DATA-NEXT:    [[TMP14:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD3]] to <vscale x 4 x i64>
+; DATA-NEXT:    [[TMP15:%.*]] = icmp slt <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP14]]
+; DATA-NEXT:    [[TMP16:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP15]])
+; DATA-NEXT:    [[CSA_MASK_SEL4]] = select i1 [[TMP16]], <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; DATA-NEXT:    [[CSA_DATA_SEL5]] = select i1 [[TMP16]], <vscale x 4 x i32> [[WIDE_LOAD3]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
+; DATA-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; DATA-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DATA-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; DATA:       middle.block:
+; DATA-NEXT:    [[CSA_STEP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; DATA-NEXT:    [[TMP18:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL4]], <vscale x 4 x i32> [[CSA_STEP6]], <vscale x 4 x i32> zeroinitializer
+; DATA-NEXT:    [[TMP19:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP18]])
+; DATA-NEXT:    [[TMP20:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL4]], i64 0
+; DATA-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP19]], 0
+; DATA-NEXT:    [[TMP22:%.*]] = and i1 [[TMP20]], [[TMP21]]
+; DATA-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], i32 0, i32 -1
+; DATA-NEXT:    [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL5]], i32 [[TMP23]]
+; DATA-NEXT:    [[TMP24:%.*]] = icmp sge i32 [[TMP23]], 0
+; DATA-NEXT:    [[TMP25:%.*]] = select i1 [[TMP24]], i32 [[CSA_EXTRACT7]], i32 -1
+; DATA-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; DATA-NEXT:    [[TMP26:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; DATA-NEXT:    [[TMP27:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP26]])
+; DATA-NEXT:    [[TMP28:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; DATA-NEXT:    [[TMP29:%.*]] = icmp eq i32 [[TMP27]], 0
+; DATA-NEXT:    [[TMP30:%.*]] = and i1 [[TMP28]], [[TMP29]]
+; DATA-NEXT:    [[TMP31:%.*]] = select i1 [[TMP30]], i32 0, i32 -1
+; DATA-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP31]]
+; DATA-NEXT:    [[TMP32:%.*]] = icmp sge i32 [[TMP31]], 0
+; DATA-NEXT:    [[TMP33:%.*]] = select i1 [[TMP32]], i32 [[CSA_EXTRACT]], i32 -1
+; DATA-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; DATA-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DATA:       scalar.ph:
+; DATA-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; DATA-NEXT:    br label [[FOR_BODY:%.*]]
 ; DATA:       for.cond.cleanup.loopexit:
-; DATA-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
+; DATA-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ]
+; DATA-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
+; DATA-NEXT:    [[TMP34:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
 ; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; DATA:       for.cond.cleanup:
-; DATA-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
+; DATA-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP34]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
 ; DATA-NEXT:    ret i32 [[OR]]
 ; DATA:       for.body:
-; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; DATA-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
-; DATA-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A:%.*]], [[TMP2]]
-; DATA-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP1]], i32 [[T_022]]
-; DATA-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; DATA-NEXT:    [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
-; DATA-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[A]], [[TMP4]]
-; DATA-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP3]], i32 [[S_023]]
+; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP35:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; DATA-NEXT:    [[TMP36:%.*]] = sext i32 [[TMP35]] to i64
+; DATA-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP36]]
+; DATA-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP35]], i32 [[T_022]]
+; DATA-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP37:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; DATA-NEXT:    [[TMP38:%.*]] = sext i32 [[TMP37]] to i64
+; DATA-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[A]], [[TMP38]]
+; DATA-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP37]], i32 [[S_023]]
 ; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ;
 entry:
   %cmp21 = icmp sgt i32 %N, 0
@@ -773,32 +1391,106 @@ define i32 @csa_in_series_int_select_induction_cmp(i32 %N, ptr %data0, ptr %data
 ; EVL-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; EVL:       for.body.preheader:
 ; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; EVL:       vector.ph:
+; EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; EVL-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; EVL-NEXT:    [[TMP7:%.*]] = add <vscale x 4 x i64> [[TMP6]], zeroinitializer
+; EVL-NEXT:    [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP7]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
+; EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; EVL-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP10]]
+; EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP11]], i64 0
+; EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; EVL:       vector.body:
+; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP12]]
+; EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
+; EVL-NEXT:    [[TMP15:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
+; EVL-NEXT:    [[TMP16:%.*]] = icmp slt <vscale x 4 x i64> [[VEC_IND]], [[TMP15]]
+; EVL-NEXT:    [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP16]])
+; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[CSA_MASK_PHI1]]
+; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI2]]
+; EVL-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP12]]
+; EVL-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 0
+; EVL-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i32>, ptr [[TMP19]], align 4
+; EVL-NEXT:    [[TMP20:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD3]] to <vscale x 4 x i64>
+; EVL-NEXT:    [[TMP21:%.*]] = icmp slt <vscale x 4 x i64> [[VEC_IND]], [[TMP20]]
+; EVL-NEXT:    [[TMP22:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP21]])
+; EVL-NEXT:    [[CSA_MASK_SEL4]] = select i1 [[TMP22]], <vscale x 4 x i1> [[TMP21]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT:    [[CSA_DATA_SEL5]] = select i1 [[TMP22]], <vscale x 4 x i32> [[WIDE_LOAD3]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
+; EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; EVL-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; EVL:       middle.block:
+; EVL-NEXT:    [[CSA_STEP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; EVL-NEXT:    [[TMP24:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL4]], <vscale x 4 x i32> [[CSA_STEP6]], <vscale x 4 x i32> zeroinitializer
+; EVL-NEXT:    [[TMP25:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP24]])
+; EVL-NEXT:    [[TMP26:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL4]], i64 0
+; EVL-NEXT:    [[TMP27:%.*]] = icmp eq i32 [[TMP25]], 0
+; EVL-NEXT:    [[TMP28:%.*]] = and i1 [[TMP26]], [[TMP27]]
+; EVL-NEXT:    [[TMP29:%.*]] = select i1 [[TMP28]], i32 0, i32 -1
+; EVL-NEXT:    [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL5]], i32 [[TMP29]]
+; EVL-NEXT:    [[TMP30:%.*]] = icmp sge i32 [[TMP29]], 0
+; EVL-NEXT:    [[TMP31:%.*]] = select i1 [[TMP30]], i32 [[CSA_EXTRACT7]], i32 -1
+; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; EVL-NEXT:    [[TMP32:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; EVL-NEXT:    [[TMP33:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP32]])
+; EVL-NEXT:    [[TMP34:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT:    [[TMP35:%.*]] = icmp eq i32 [[TMP33]], 0
+; EVL-NEXT:    [[TMP36:%.*]] = and i1 [[TMP34]], [[TMP35]]
+; EVL-NEXT:    [[TMP37:%.*]] = select i1 [[TMP36]], i32 0, i32 -1
+; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP37]]
+; EVL-NEXT:    [[TMP38:%.*]] = icmp sge i32 [[TMP37]], 0
+; EVL-NEXT:    [[TMP39:%.*]] = select i1 [[TMP38]], i32 [[CSA_EXTRACT]], i32 -1
+; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL:       scalar.ph:
+; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; EVL-NEXT:    br label [[FOR_BODY:%.*]]
 ; EVL:       for.cond.cleanup.loopexit:
-; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
+; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT:    [[TMP40:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
 ; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; EVL:       for.cond.cleanup:
-; EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
+; EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP40]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
 ; EVL-NEXT:    ret i32 [[OR]]
 ; EVL:       for.body:
-; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
-; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP2]]
-; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP1]], i32 [[T_022]]
-; EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; EVL-NEXT:    [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
-; EVL-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP4]]
-; EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP3]], i32 [[S_023]]
+; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP41:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT:    [[TMP42:%.*]] = sext i32 [[TMP41]] to i64
+; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP42]]
+; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP41]], i32 [[T_022]]
+; EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP43:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; EVL-NEXT:    [[TMP44:%.*]] = sext i32 [[TMP43]] to i64
+; EVL-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP44]]
+; EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP43]], i32 [[S_023]]
 ; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ;
 ; NO-EVL-LABEL: @csa_in_series_int_select_induction_cmp(
 ; NO-EVL-NEXT:  entry:
@@ -806,32 +1498,106 @@ define i32 @csa_in_series_int_select_induction_cmp(i32 %N, ptr %data0, ptr %data
 ; NO-EVL-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; NO-EVL:       for.body.preheader:
 ; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-EVL:       vector.ph:
+; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; NO-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-EVL-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; NO-EVL-NEXT:    [[TMP7:%.*]] = add <vscale x 4 x i64> [[TMP6]], zeroinitializer
+; NO-EVL-NEXT:    [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP7]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; NO-EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
+; NO-EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; NO-EVL-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP10]]
+; NO-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP11]], i64 0
+; NO-EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NO-EVL:       vector.body:
+; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP12]]
+; NO-EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
+; NO-EVL-NEXT:    [[TMP15:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
+; NO-EVL-NEXT:    [[TMP16:%.*]] = icmp slt <vscale x 4 x i64> [[VEC_IND]], [[TMP15]]
+; NO-EVL-NEXT:    [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP16]])
+; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[CSA_MASK_PHI1]]
+; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI2]]
+; NO-EVL-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP12]]
+; NO-EVL-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 0
+; NO-EVL-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i32>, ptr [[TMP19]], align 4
+; NO-EVL-NEXT:    [[TMP20:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD3]] to <vscale x 4 x i64>
+; NO-EVL-NEXT:    [[TMP21:%.*]] = icmp slt <vscale x 4 x i64> [[VEC_IND]], [[TMP20]]
+; NO-EVL-NEXT:    [[TMP22:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP21]])
+; NO-EVL-NEXT:    [[CSA_MASK_SEL4]] = select i1 [[TMP22]], <vscale x 4 x i1> [[TMP21]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT:    [[CSA_DATA_SEL5]] = select i1 [[TMP22]], <vscale x 4 x i32> [[WIDE_LOAD3]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; NO-EVL-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; NO-EVL:       middle.block:
+; NO-EVL-NEXT:    [[CSA_STEP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; NO-EVL-NEXT:    [[TMP24:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL4]], <vscale x 4 x i32> [[CSA_STEP6]], <vscale x 4 x i32> zeroinitializer
+; NO-EVL-NEXT:    [[TMP25:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP24]])
+; NO-EVL-NEXT:    [[TMP26:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL4]], i64 0
+; NO-EVL-NEXT:    [[TMP27:%.*]] = icmp eq i32 [[TMP25]], 0
+; NO-EVL-NEXT:    [[TMP28:%.*]] = and i1 [[TMP26]], [[TMP27]]
+; NO-EVL-NEXT:    [[TMP29:%.*]] = select i1 [[TMP28]], i32 0, i32 -1
+; NO-EVL-NEXT:    [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL5]], i32 [[TMP29]]
+; NO-EVL-NEXT:    [[TMP30:%.*]] = icmp sge i32 [[TMP29]], 0
+; NO-EVL-NEXT:    [[TMP31:%.*]] = select i1 [[TMP30]], i32 [[CSA_EXTRACT7]], i32 -1
+; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; NO-EVL-NEXT:    [[TMP32:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; NO-EVL-NEXT:    [[TMP33:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP32]])
+; NO-EVL-NEXT:    [[TMP34:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT:    [[TMP35:%.*]] = icmp eq i32 [[TMP33]], 0
+; NO-EVL-NEXT:    [[TMP36:%.*]] = and i1 [[TMP34]], [[TMP35]]
+; NO-EVL-NEXT:    [[TMP37:%.*]] = select i1 [[TMP36]], i32 0, i32 -1
+; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP37]]
+; NO-EVL-NEXT:    [[TMP38:%.*]] = icmp sge i32 [[TMP37]], 0
+; NO-EVL-NEXT:    [[TMP39:%.*]] = select i1 [[TMP38]], i32 [[CSA_EXTRACT]], i32 -1
+; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL:       scalar.ph:
+; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
 ; NO-EVL:       for.cond.cleanup.loopexit:
-; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
+; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT:    [[TMP40:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
 ; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; NO-EVL:       for.cond.cleanup:
-; NO-EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
+; NO-EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP40]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
 ; NO-EVL-NEXT:    ret i32 [[OR]]
 ; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; NO-EVL-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
-; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP2]]
-; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP1]], i32 [[T_022]]
-; NO-EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; NO-EVL-NEXT:    [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
-; NO-EVL-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP4]]
-; NO-EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP3]], i32 [[S_023]]
+; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP41:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT:    [[TMP42:%.*]] = sext i32 [[TMP41]] to i64
+; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP42]]
+; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP41]], i32 [[T_022]]
+; NO-EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP43:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; NO-EVL-NEXT:    [[TMP44:%.*]] = sext i32 [[TMP43]] to i64
+; NO-EVL-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP44]]
+; NO-EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP43]], i32 [[S_023]]
 ; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ;
 ; DATA-LABEL: @csa_in_series_int_select_induction_cmp(
 ; DATA-NEXT:  entry:
@@ -839,32 +1605,106 @@ define i32 @csa_in_series_int_select_induction_cmp(i32 %N, ptr %data0, ptr %data
 ; DATA-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; DATA:       for.body.preheader:
 ; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; DATA-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; DATA-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DATA:       vector.ph:
+; DATA-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; DATA-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; DATA-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; DATA-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; DATA-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; DATA-NEXT:    [[TMP7:%.*]] = add <vscale x 4 x i64> [[TMP6]], zeroinitializer
+; DATA-NEXT:    [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP7]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; DATA-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
+; DATA-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; DATA-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP10]]
+; DATA-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP11]], i64 0
+; DATA-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; DATA-NEXT:    br label [[VECTOR_BODY:%.*]]
+; DATA:       vector.body:
+; DATA-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; DATA-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP12]]
+; DATA-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; DATA-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
+; DATA-NEXT:    [[TMP15:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
+; DATA-NEXT:    [[TMP16:%.*]] = icmp slt <vscale x 4 x i64> [[VEC_IND]], [[TMP15]]
+; DATA-NEXT:    [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP16]])
+; DATA-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[CSA_MASK_PHI1]]
+; DATA-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI2]]
+; DATA-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP12]]
+; DATA-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 0
+; DATA-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i32>, ptr [[TMP19]], align 4
+; DATA-NEXT:    [[TMP20:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD3]] to <vscale x 4 x i64>
+; DATA-NEXT:    [[TMP21:%.*]] = icmp slt <vscale x 4 x i64> [[VEC_IND]], [[TMP20]]
+; DATA-NEXT:    [[TMP22:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP21]])
+; DATA-NEXT:    [[CSA_MASK_SEL4]] = select i1 [[TMP22]], <vscale x 4 x i1> [[TMP21]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; DATA-NEXT:    [[CSA_DATA_SEL5]] = select i1 [[TMP22]], <vscale x 4 x i32> [[WIDE_LOAD3]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
+; DATA-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; DATA-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; DATA-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DATA-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; DATA:       middle.block:
+; DATA-NEXT:    [[CSA_STEP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; DATA-NEXT:    [[TMP24:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL4]], <vscale x 4 x i32> [[CSA_STEP6]], <vscale x 4 x i32> zeroinitializer
+; DATA-NEXT:    [[TMP25:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP24]])
+; DATA-NEXT:    [[TMP26:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL4]], i64 0
+; DATA-NEXT:    [[TMP27:%.*]] = icmp eq i32 [[TMP25]], 0
+; DATA-NEXT:    [[TMP28:%.*]] = and i1 [[TMP26]], [[TMP27]]
+; DATA-NEXT:    [[TMP29:%.*]] = select i1 [[TMP28]], i32 0, i32 -1
+; DATA-NEXT:    [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL5]], i32 [[TMP29]]
+; DATA-NEXT:    [[TMP30:%.*]] = icmp sge i32 [[TMP29]], 0
+; DATA-NEXT:    [[TMP31:%.*]] = select i1 [[TMP30]], i32 [[CSA_EXTRACT7]], i32 -1
+; DATA-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; DATA-NEXT:    [[TMP32:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; DATA-NEXT:    [[TMP33:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP32]])
+; DATA-NEXT:    [[TMP34:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; DATA-NEXT:    [[TMP35:%.*]] = icmp eq i32 [[TMP33]], 0
+; DATA-NEXT:    [[TMP36:%.*]] = and i1 [[TMP34]], [[TMP35]]
+; DATA-NEXT:    [[TMP37:%.*]] = select i1 [[TMP36]], i32 0, i32 -1
+; DATA-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP37]]
+; DATA-NEXT:    [[TMP38:%.*]] = icmp sge i32 [[TMP37]], 0
+; DATA-NEXT:    [[TMP39:%.*]] = select i1 [[TMP38]], i32 [[CSA_EXTRACT]], i32 -1
+; DATA-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; DATA-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DATA:       scalar.ph:
+; DATA-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; DATA-NEXT:    br label [[FOR_BODY:%.*]]
 ; DATA:       for.cond.cleanup.loopexit:
-; DATA-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
+; DATA-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ]
+; DATA-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ]
+; DATA-NEXT:    [[TMP40:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
 ; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; DATA:       for.cond.cleanup:
-; DATA-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
+; DATA-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP40]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
 ; DATA-NEXT:    ret i32 [[OR]]
 ; DATA:       for.body:
-; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; DATA-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
-; DATA-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP2]]
-; DATA-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP1]], i32 [[T_022]]
-; DATA-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; DATA-NEXT:    [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
-; DATA-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP4]]
-; DATA-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP3]], i32 [[S_023]]
+; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP41:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; DATA-NEXT:    [[TMP42:%.*]] = sext i32 [[TMP41]] to i64
+; DATA-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP42]]
+; DATA-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP41]], i32 [[T_022]]
+; DATA-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP43:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; DATA-NEXT:    [[TMP44:%.*]] = sext i32 [[TMP43]] to i64
+; DATA-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP44]]
+; DATA-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP43]], i32 [[S_023]]
 ; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ;
 entry:
   %cmp21 = icmp sgt i32 %N, 0
@@ -921,30 +1761,91 @@ define float @csa_in_series_float_select(i32 %N, ptr %data0, ptr %data1) {
 ; EVL-NEXT:    br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; EVL:       for.body.preheader:
 ; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; EVL:       vector.ph:
+; EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; EVL:       vector.body:
+; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[TMP6]]
+; EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
+; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
+; EVL-NEXT:    [[TMP9:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD]], zeroinitializer
+; EVL-NEXT:    [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP9]])
+; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP10]], <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> [[CSA_MASK_PHI1]]
+; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP10]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[CSA_DATA_PHI2]]
+; EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[TMP6]]
+; EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0
+; EVL-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, ptr [[TMP12]], align 4
+; EVL-NEXT:    [[TMP13:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD3]], zeroinitializer
+; EVL-NEXT:    [[TMP14:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP13]])
+; EVL-NEXT:    [[CSA_MASK_SEL4]] = select i1 [[TMP14]], <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT:    [[CSA_DATA_SEL5]] = select i1 [[TMP14]], <vscale x 4 x float> [[WIDE_LOAD3]], <vscale x 4 x float> [[CSA_DATA_PHI]]
+; EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; EVL-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; EVL:       middle.block:
+; EVL-NEXT:    [[CSA_STEP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; EVL-NEXT:    [[TMP16:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL4]], <vscale x 4 x i32> [[CSA_STEP6]], <vscale x 4 x i32> zeroinitializer
+; EVL-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP16]])
+; EVL-NEXT:    [[TMP18:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL4]], i64 0
+; EVL-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[TMP17]], 0
+; EVL-NEXT:    [[TMP20:%.*]] = and i1 [[TMP18]], [[TMP19]]
+; EVL-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], i32 0, i32 -1
+; EVL-NEXT:    [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 4 x float> [[CSA_DATA_SEL5]], i32 [[TMP21]]
+; EVL-NEXT:    [[TMP22:%.*]] = icmp sge i32 [[TMP21]], 0
+; EVL-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], float [[CSA_EXTRACT7]], float 1.000000e+00
+; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; EVL-NEXT:    [[TMP24:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; EVL-NEXT:    [[TMP25:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP24]])
+; EVL-NEXT:    [[TMP26:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT:    [[TMP27:%.*]] = icmp eq i32 [[TMP25]], 0
+; EVL-NEXT:    [[TMP28:%.*]] = and i1 [[TMP26]], [[TMP27]]
+; EVL-NEXT:    [[TMP29:%.*]] = select i1 [[TMP28]], i32 0, i32 -1
+; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x float> [[CSA_DATA_SEL]], i32 [[TMP29]]
+; EVL-NEXT:    [[TMP30:%.*]] = icmp sge i32 [[TMP29]], 0
+; EVL-NEXT:    [[TMP31:%.*]] = select i1 [[TMP30]], float [[CSA_EXTRACT]], float 1.000000e+00
+; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL:       scalar.ph:
+; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; EVL-NEXT:    br label [[FOR_BODY:%.*]]
 ; EVL:       for.cond.cleanup.loopexit:
-; EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
+; EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT:    [[TMP32:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
 ; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; EVL:       for.cond.cleanup:
-; EVL-NEXT:    [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
+; EVL-NEXT:    [[ADD:%.*]] = phi float [ [[TMP32]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
 ; EVL-NEXT:    ret float [[ADD]]
 ; EVL:       for.body:
-; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[S_021:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[T_020:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP1:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP1]], 0.000000e+00
-; EVL-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP1]], float [[T_020]]
-; EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
-; EVL-NEXT:    [[CMP6:%.*]] = fcmp ogt float [[TMP2]], 0.000000e+00
-; EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], float [[TMP2]], float [[S_021]]
+; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[S_021:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[T_020:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP33:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP33]], 0.000000e+00
+; EVL-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP33]], float [[T_020]]
+; EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP34:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
+; EVL-NEXT:    [[CMP6:%.*]] = fcmp ogt float [[TMP34]], 0.000000e+00
+; EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], float [[TMP34]], float [[S_021]]
 ; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
 ;
 ; NO-EVL-LABEL: @csa_in_series_float_select(
 ; NO-EVL-NEXT:  entry:
@@ -952,30 +1853,91 @@ define float @csa_in_series_float_select(i32 %N, ptr %data0, ptr %data1) {
 ; NO-EVL-NEXT:    br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; NO-EVL:       for.body.preheader:
 ; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-EVL:       vector.ph:
+; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; NO-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NO-EVL:       vector.body:
+; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[TMP6]]
+; NO-EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
+; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
+; NO-EVL-NEXT:    [[TMP9:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD]], zeroinitializer
+; NO-EVL-NEXT:    [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP9]])
+; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP10]], <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> [[CSA_MASK_PHI1]]
+; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP10]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[CSA_DATA_PHI2]]
+; NO-EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[TMP6]]
+; NO-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0
+; NO-EVL-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, ptr [[TMP12]], align 4
+; NO-EVL-NEXT:    [[TMP13:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD3]], zeroinitializer
+; NO-EVL-NEXT:    [[TMP14:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP13]])
+; NO-EVL-NEXT:    [[CSA_MASK_SEL4]] = select i1 [[TMP14]], <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT:    [[CSA_DATA_SEL5]] = select i1 [[TMP14]], <vscale x 4 x float> [[WIDE_LOAD3]], <vscale x 4 x float> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-EVL-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; NO-EVL:       middle.block:
+; NO-EVL-NEXT:    [[CSA_STEP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; NO-EVL-NEXT:    [[TMP16:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL4]], <vscale x 4 x i32> [[CSA_STEP6]], <vscale x 4 x i32> zeroinitializer
+; NO-EVL-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP16]])
+; NO-EVL-NEXT:    [[TMP18:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL4]], i64 0
+; NO-EVL-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[TMP17]], 0
+; NO-EVL-NEXT:    [[TMP20:%.*]] = and i1 [[TMP18]], [[TMP19]]
+; NO-EVL-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], i32 0, i32 -1
+; NO-EVL-NEXT:    [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 4 x float> [[CSA_DATA_SEL5]], i32 [[TMP21]]
+; NO-EVL-NEXT:    [[TMP22:%.*]] = icmp sge i32 [[TMP21]], 0
+; NO-EVL-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], float [[CSA_EXTRACT7]], float 1.000000e+00
+; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; NO-EVL-NEXT:    [[TMP24:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; NO-EVL-NEXT:    [[TMP25:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP24]])
+; NO-EVL-NEXT:    [[TMP26:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT:    [[TMP27:%.*]] = icmp eq i32 [[TMP25]], 0
+; NO-EVL-NEXT:    [[TMP28:%.*]] = and i1 [[TMP26]], [[TMP27]]
+; NO-EVL-NEXT:    [[TMP29:%.*]] = select i1 [[TMP28]], i32 0, i32 -1
+; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x float> [[CSA_DATA_SEL]], i32 [[TMP29]]
+; NO-EVL-NEXT:    [[TMP30:%.*]] = icmp sge i32 [[TMP29]], 0
+; NO-EVL-NEXT:    [[TMP31:%.*]] = select i1 [[TMP30]], float [[CSA_EXTRACT]], float 1.000000e+00
+; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL:       scalar.ph:
+; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
 ; NO-EVL:       for.cond.cleanup.loopexit:
-; NO-EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
+; NO-EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT:    [[TMP32:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
 ; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; NO-EVL:       for.cond.cleanup:
-; NO-EVL-NEXT:    [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
+; NO-EVL-NEXT:    [[ADD:%.*]] = phi float [ [[TMP32]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
 ; NO-EVL-NEXT:    ret float [[ADD]]
 ; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[S_021:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[T_020:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP1:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; NO-EVL-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP1]], 0.000000e+00
-; NO-EVL-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP1]], float [[T_020]]
-; NO-EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
-; NO-EVL-NEXT:    [[CMP6:%.*]] = fcmp ogt float [[TMP2]], 0.000000e+00
-; NO-EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], float [[TMP2]], float [[S_021]]
+; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[S_021:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[T_020:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP33:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP33]], 0.000000e+00
+; NO-EVL-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP33]], float [[T_020]]
+; NO-EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP34:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
+; NO-EVL-NEXT:    [[CMP6:%.*]] = fcmp ogt float [[TMP34]], 0.000000e+00
+; NO-EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], float [[TMP34]], float [[S_021]]
 ; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
 ;
 ; DATA-LABEL: @csa_in_series_float_select(
 ; DATA-NEXT:  entry:
@@ -983,30 +1945,91 @@ define float @csa_in_series_float_select(i32 %N, ptr %data0, ptr %data1) {
 ; DATA-NEXT:    br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; DATA:       for.body.preheader:
 ; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; DATA-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; DATA-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DATA:       vector.ph:
+; DATA-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; DATA-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; DATA-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; DATA-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; DATA-NEXT:    br label [[VECTOR_BODY:%.*]]
+; DATA:       vector.body:
+; DATA-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; DATA-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[TMP6]]
+; DATA-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
+; DATA-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
+; DATA-NEXT:    [[TMP9:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD]], zeroinitializer
+; DATA-NEXT:    [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP9]])
+; DATA-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP10]], <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> [[CSA_MASK_PHI1]]
+; DATA-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP10]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[CSA_DATA_PHI2]]
+; DATA-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[TMP6]]
+; DATA-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0
+; DATA-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, ptr [[TMP12]], align 4
+; DATA-NEXT:    [[TMP13:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD3]], zeroinitializer
+; DATA-NEXT:    [[TMP14:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP13]])
+; DATA-NEXT:    [[CSA_MASK_SEL4]] = select i1 [[TMP14]], <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; DATA-NEXT:    [[CSA_DATA_SEL5]] = select i1 [[TMP14]], <vscale x 4 x float> [[WIDE_LOAD3]], <vscale x 4 x float> [[CSA_DATA_PHI]]
+; DATA-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; DATA-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DATA-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; DATA:       middle.block:
+; DATA-NEXT:    [[CSA_STEP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; DATA-NEXT:    [[TMP16:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL4]], <vscale x 4 x i32> [[CSA_STEP6]], <vscale x 4 x i32> zeroinitializer
+; DATA-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP16]])
+; DATA-NEXT:    [[TMP18:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL4]], i64 0
+; DATA-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[TMP17]], 0
+; DATA-NEXT:    [[TMP20:%.*]] = and i1 [[TMP18]], [[TMP19]]
+; DATA-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], i32 0, i32 -1
+; DATA-NEXT:    [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 4 x float> [[CSA_DATA_SEL5]], i32 [[TMP21]]
+; DATA-NEXT:    [[TMP22:%.*]] = icmp sge i32 [[TMP21]], 0
+; DATA-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], float [[CSA_EXTRACT7]], float 1.000000e+00
+; DATA-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; DATA-NEXT:    [[TMP24:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; DATA-NEXT:    [[TMP25:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP24]])
+; DATA-NEXT:    [[TMP26:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; DATA-NEXT:    [[TMP27:%.*]] = icmp eq i32 [[TMP25]], 0
+; DATA-NEXT:    [[TMP28:%.*]] = and i1 [[TMP26]], [[TMP27]]
+; DATA-NEXT:    [[TMP29:%.*]] = select i1 [[TMP28]], i32 0, i32 -1
+; DATA-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x float> [[CSA_DATA_SEL]], i32 [[TMP29]]
+; DATA-NEXT:    [[TMP30:%.*]] = icmp sge i32 [[TMP29]], 0
+; DATA-NEXT:    [[TMP31:%.*]] = select i1 [[TMP30]], float [[CSA_EXTRACT]], float 1.000000e+00
+; DATA-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; DATA-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DATA:       scalar.ph:
+; DATA-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; DATA-NEXT:    br label [[FOR_BODY:%.*]]
 ; DATA:       for.cond.cleanup.loopexit:
-; DATA-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
+; DATA-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ]
+; DATA-NEXT:    [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
+; DATA-NEXT:    [[TMP32:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
 ; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; DATA:       for.cond.cleanup:
-; DATA-NEXT:    [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
+; DATA-NEXT:    [[ADD:%.*]] = phi float [ [[TMP32]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
 ; DATA-NEXT:    ret float [[ADD]]
 ; DATA:       for.body:
-; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[S_021:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[T_020:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP1:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; DATA-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP1]], 0.000000e+00
-; DATA-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP1]], float [[T_020]]
-; DATA-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
-; DATA-NEXT:    [[CMP6:%.*]] = fcmp ogt float [[TMP2]], 0.000000e+00
-; DATA-NEXT:    [[S_1]] = select i1 [[CMP6]], float [[TMP2]], float [[S_021]]
+; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[S_021:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[T_020:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP33:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; DATA-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP33]], 0.000000e+00
+; DATA-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP33]], float [[T_020]]
+; DATA-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP34:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
+; DATA-NEXT:    [[CMP6:%.*]] = fcmp ogt float [[TMP34]], 0.000000e+00
+; DATA-NEXT:    [[S_1]] = select i1 [[CMP6]], float [[TMP34]], float [[S_021]]
 ; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
 ;
 entry:
   %cmp19 = icmp sgt i32 %N, 0
@@ -3074,75 +4097,243 @@ define i64 @idx_scalar(ptr %a, ptr %b, i64 %ii, i64 %n) {
 ; EVL-NEXT:    [[CMP8_NOT:%.*]] = icmp eq i64 [[N:%.*]], 0
 ; EVL-NEXT:    br i1 [[CMP8_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
 ; EVL:       for.body.preheader:
+; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; EVL:       vector.ph:
+; EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; EVL-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; EVL-NEXT:    [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
+; EVL-NEXT:    [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
+; EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; EVL-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP10]]
+; EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
+; EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; EVL:       vector.body:
+; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP12]]
+; EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i32 0
+; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP14]], align 8
+; EVL-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP12]]
+; EVL-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[TMP15]], i32 0
+; EVL-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i64>, ptr [[TMP16]], align 8
+; EVL-NEXT:    [[TMP17:%.*]] = icmp sgt <vscale x 2 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; EVL-NEXT:    [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP17]])
+; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP18]], <vscale x 2 x i1> [[TMP17]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP18]], <vscale x 2 x i64> [[VEC_IND]], <vscale x 2 x i64> [[CSA_DATA_PHI]]
+; EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; EVL-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; EVL:       middle.block:
+; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
+; EVL-NEXT:    [[TMP20:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
+; EVL-NEXT:    [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP20]])
+; EVL-NEXT:    [[TMP22:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT:    [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
+; EVL-NEXT:    [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
+; EVL-NEXT:    [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
+; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x i64> [[CSA_DATA_SEL]], i32 [[TMP25]]
+; EVL-NEXT:    [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
+; EVL-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
+; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL:       scalar.ph:
+; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; EVL-NEXT:    br label [[FOR_BODY:%.*]]
 ; EVL:       for.cond.cleanup.loopexit:
-; EVL-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
 ; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; EVL:       for.cond.cleanup:
-; EVL-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II:%.*]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
 ; EVL-NEXT:    ret i64 [[IDX_0_LCSSA]]
 ; EVL:       for.body:
-; EVL-NEXT:    [[I_010:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; EVL-NEXT:    [[IDX_09:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[FOR_BODY_PREHEADER]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[I_010]]
-; EVL-NEXT:    [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; EVL-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[I_010]]
-; EVL-NEXT:    [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
-; EVL-NEXT:    [[CMP2:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]]
+; EVL-NEXT:    [[I_010:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; EVL-NEXT:    [[IDX_09:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[I_010]]
+; EVL-NEXT:    [[TMP28:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; EVL-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[I_010]]
+; EVL-NEXT:    [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; EVL-NEXT:    [[CMP2:%.*]] = icmp sgt i64 [[TMP28]], [[TMP29]]
 ; EVL-NEXT:    [[COND]] = select i1 [[CMP2]], i64 [[I_010]], i64 [[IDX_09]]
 ; EVL-NEXT:    [[INC]] = add nuw i64 [[I_010]], 1
 ; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ;
 ; NO-EVL-LABEL: @idx_scalar(
 ; NO-EVL-NEXT:  entry:
 ; NO-EVL-NEXT:    [[CMP8_NOT:%.*]] = icmp eq i64 [[N:%.*]], 0
 ; NO-EVL-NEXT:    br i1 [[CMP8_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
 ; NO-EVL:       for.body.preheader:
+; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-EVL:       vector.ph:
+; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; NO-EVL-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; NO-EVL-NEXT:    [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
+; NO-EVL-NEXT:    [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; NO-EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
+; NO-EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; NO-EVL-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP10]]
+; NO-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
+; NO-EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NO-EVL:       vector.body:
+; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP12]]
+; NO-EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i32 0
+; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP14]], align 8
+; NO-EVL-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP12]]
+; NO-EVL-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[TMP15]], i32 0
+; NO-EVL-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i64>, ptr [[TMP16]], align 8
+; NO-EVL-NEXT:    [[TMP17:%.*]] = icmp sgt <vscale x 2 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; NO-EVL-NEXT:    [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP17]])
+; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP18]], <vscale x 2 x i1> [[TMP17]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP18]], <vscale x 2 x i64> [[VEC_IND]], <vscale x 2 x i64> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; NO-EVL-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; NO-EVL:       middle.block:
+; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
+; NO-EVL-NEXT:    [[TMP20:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
+; NO-EVL-NEXT:    [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP20]])
+; NO-EVL-NEXT:    [[TMP22:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT:    [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
+; NO-EVL-NEXT:    [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
+; NO-EVL-NEXT:    [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
+; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x i64> [[CSA_DATA_SEL]], i32 [[TMP25]]
+; NO-EVL-NEXT:    [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
+; NO-EVL-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
+; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL:       scalar.ph:
+; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
 ; NO-EVL:       for.cond.cleanup.loopexit:
-; NO-EVL-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
 ; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; NO-EVL:       for.cond.cleanup:
-; NO-EVL-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II:%.*]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
 ; NO-EVL-NEXT:    ret i64 [[IDX_0_LCSSA]]
 ; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[I_010:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; NO-EVL-NEXT:    [[IDX_09:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[FOR_BODY_PREHEADER]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[I_010]]
-; NO-EVL-NEXT:    [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; NO-EVL-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[I_010]]
-; NO-EVL-NEXT:    [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
-; NO-EVL-NEXT:    [[CMP2:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]]
+; NO-EVL-NEXT:    [[I_010:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; NO-EVL-NEXT:    [[IDX_09:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[I_010]]
+; NO-EVL-NEXT:    [[TMP28:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; NO-EVL-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[I_010]]
+; NO-EVL-NEXT:    [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; NO-EVL-NEXT:    [[CMP2:%.*]] = icmp sgt i64 [[TMP28]], [[TMP29]]
 ; NO-EVL-NEXT:    [[COND]] = select i1 [[CMP2]], i64 [[I_010]], i64 [[IDX_09]]
 ; NO-EVL-NEXT:    [[INC]] = add nuw i64 [[I_010]], 1
 ; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ;
 ; DATA-LABEL: @idx_scalar(
 ; DATA-NEXT:  entry:
 ; DATA-NEXT:    [[CMP8_NOT:%.*]] = icmp eq i64 [[N:%.*]], 0
 ; DATA-NEXT:    br i1 [[CMP8_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
 ; DATA:       for.body.preheader:
+; DATA-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; DATA-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; DATA-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DATA:       vector.ph:
+; DATA-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; DATA-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; DATA-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; DATA-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; DATA-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; DATA-NEXT:    [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
+; DATA-NEXT:    [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; DATA-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
+; DATA-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; DATA-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP10]]
+; DATA-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
+; DATA-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; DATA-NEXT:    br label [[VECTOR_BODY:%.*]]
+; DATA:       vector.body:
+; DATA-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; DATA-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP12]]
+; DATA-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i32 0
+; DATA-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP14]], align 8
+; DATA-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP12]]
+; DATA-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[TMP15]], i32 0
+; DATA-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i64>, ptr [[TMP16]], align 8
+; DATA-NEXT:    [[TMP17:%.*]] = icmp sgt <vscale x 2 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; DATA-NEXT:    [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP17]])
+; DATA-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP18]], <vscale x 2 x i1> [[TMP17]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
+; DATA-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP18]], <vscale x 2 x i64> [[VEC_IND]], <vscale x 2 x i64> [[CSA_DATA_PHI]]
+; DATA-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; DATA-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; DATA-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DATA-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; DATA:       middle.block:
+; DATA-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
+; DATA-NEXT:    [[TMP20:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
+; DATA-NEXT:    [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP20]])
+; DATA-NEXT:    [[TMP22:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
+; DATA-NEXT:    [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
+; DATA-NEXT:    [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
+; DATA-NEXT:    [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
+; DATA-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x i64> [[CSA_DATA_SEL]], i32 [[TMP25]]
+; DATA-NEXT:    [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
+; DATA-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
+; DATA-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; DATA-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DATA:       scalar.ph:
+; DATA-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; DATA-NEXT:    br label [[FOR_BODY:%.*]]
 ; DATA:       for.cond.cleanup.loopexit:
-; DATA-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
 ; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; DATA:       for.cond.cleanup:
-; DATA-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II:%.*]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; DATA-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
 ; DATA-NEXT:    ret i64 [[IDX_0_LCSSA]]
 ; DATA:       for.body:
-; DATA-NEXT:    [[I_010:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; DATA-NEXT:    [[IDX_09:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[FOR_BODY_PREHEADER]] ]
-; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[I_010]]
-; DATA-NEXT:    [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; DATA-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[I_010]]
-; DATA-NEXT:    [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
-; DATA-NEXT:    [[CMP2:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]]
+; DATA-NEXT:    [[I_010:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; DATA-NEXT:    [[IDX_09:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
+; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[I_010]]
+; DATA-NEXT:    [[TMP28:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; DATA-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[I_010]]
+; DATA-NEXT:    [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; DATA-NEXT:    [[CMP2:%.*]] = icmp sgt i64 [[TMP28]], [[TMP29]]
 ; DATA-NEXT:    [[COND]] = select i1 [[CMP2]], i64 [[I_010]], i64 [[IDX_09]]
 ; DATA-NEXT:    [[INC]] = add nuw i64 [[I_010]], 1
 ; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ;
 entry:
   %cmp8.not = icmp eq i64 %n, 0
@@ -3186,75 +4377,273 @@ define dso_local i64 @idx_scalar_dec(ptr %a, ptr %b, i64 %ii, i64 %n) {
 ; EVL-NEXT:    [[CMP_NOT9:%.*]] = icmp eq i64 [[N:%.*]], 0
 ; EVL-NEXT:    br i1 [[CMP_NOT9]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
 ; EVL:       for.body.preheader:
+; EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; EVL:       vector.ph:
+; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; EVL-NEXT:    [[IND_END:%.*]] = sub i64 [[N]], [[N_VEC]]
+; EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0
+; EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; EVL-NEXT:    [[INDUCTION:%.*]] = add <4 x i64> [[DOTSPLAT]], <i64 0, i64 -1, i64 -2, i64 -3>
+; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; EVL:       vector.body:
+; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL8:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <4 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL9:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], <i64 -4, i64 -4, i64 -4, i64 -4>
+; EVL-NEXT:    [[TMP0:%.*]] = add <4 x i64> [[VEC_IND]], <i64 -1, i64 -1, i64 -1, i64 -1>
+; EVL-NEXT:    [[TMP1:%.*]] = add <4 x i64> [[STEP_ADD]], <i64 -1, i64 -1, i64 -1, i64 -1>
+; EVL-NEXT:    [[TMP2:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
+; EVL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]]
+; EVL-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP1]], i32 0
+; EVL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; EVL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
+; EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -3
+; EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 -4
+; EVL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 -3
+; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8
+; EVL-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; EVL-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP9]], align 8
+; EVL-NEXT:    [[REVERSE3:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD2]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; EVL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP2]]
+; EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
+; EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
+; EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 -3
+; EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 -4
+; EVL-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP14]], i32 -3
+; EVL-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP13]], align 8
+; EVL-NEXT:    [[REVERSE5:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD4]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; EVL-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i64>, ptr [[TMP15]], align 8
+; EVL-NEXT:    [[REVERSE7:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD6]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; EVL-NEXT:    [[TMP16:%.*]] = icmp sgt <4 x i64> [[REVERSE]], [[REVERSE5]]
+; EVL-NEXT:    [[TMP17:%.*]] = icmp sgt <4 x i64> [[REVERSE3]], [[REVERSE7]]
+; EVL-NEXT:    [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP16]])
+; EVL-NEXT:    [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP17]])
+; EVL-NEXT:    [[CSA_MASK_SEL:%.*]] = select i1 [[TMP18]], <4 x i1> [[TMP16]], <4 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT:    [[CSA_MASK_SEL8]] = select i1 [[TMP19]], <4 x i1> [[TMP17]], <4 x i1> [[CSA_MASK_SEL]]
+; EVL-NEXT:    [[CSA_DATA_SEL:%.*]] = select i1 [[TMP18]], <4 x i64> [[VEC_IND]], <4 x i64> [[CSA_DATA_PHI]]
+; EVL-NEXT:    [[CSA_DATA_SEL9]] = select i1 [[TMP19]], <4 x i64> [[STEP_ADD]], <4 x i64> [[CSA_DATA_SEL]]
+; EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; EVL-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], <i64 -4, i64 -4, i64 -4, i64 -4>
+; EVL-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; EVL:       middle.block:
+; EVL-NEXT:    [[TMP21:%.*]] = select <4 x i1> [[CSA_MASK_SEL8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> zeroinitializer
+; EVL-NEXT:    [[TMP22:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP21]])
+; EVL-NEXT:    [[TMP23:%.*]] = extractelement <4 x i1> [[CSA_MASK_SEL8]], i64 0
+; EVL-NEXT:    [[TMP24:%.*]] = icmp eq i32 [[TMP22]], 0
+; EVL-NEXT:    [[TMP25:%.*]] = and i1 [[TMP23]], [[TMP24]]
+; EVL-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i32 0, i32 -1
+; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <4 x i64> [[CSA_DATA_SEL9]], i32 [[TMP26]]
+; EVL-NEXT:    [[TMP27:%.*]] = icmp sge i32 [[TMP26]], 0
+; EVL-NEXT:    [[TMP28:%.*]] = select i1 [[TMP27]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
+; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL:       scalar.ph:
+; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ]
 ; EVL-NEXT:    br label [[FOR_BODY:%.*]]
 ; EVL:       for.cond.cleanup.loopexit:
-; EVL-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ]
 ; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; EVL:       for.cond.cleanup:
-; EVL-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II:%.*]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
 ; EVL-NEXT:    ret i64 [[IDX_0_LCSSA]]
 ; EVL:       for.body:
-; EVL-NEXT:    [[I_011:%.*]] = phi i64 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ]
-; EVL-NEXT:    [[IDX_010:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[FOR_BODY_PREHEADER]] ]
+; EVL-NEXT:    [[I_011:%.*]] = phi i64 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; EVL-NEXT:    [[IDX_010:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
 ; EVL-NEXT:    [[SUB]] = add i64 [[I_011]], -1
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[SUB]]
-; EVL-NEXT:    [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[SUB]]
-; EVL-NEXT:    [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
-; EVL-NEXT:    [[CMP3:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[SUB]]
+; EVL-NEXT:    [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[SUB]]
+; EVL-NEXT:    [[TMP30:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
+; EVL-NEXT:    [[CMP3:%.*]] = icmp sgt i64 [[TMP29]], [[TMP30]]
 ; EVL-NEXT:    [[COND]] = select i1 [[CMP3]], i64 [[I_011]], i64 [[IDX_010]]
 ; EVL-NEXT:    [[CMP_NOT:%.*]] = icmp eq i64 [[SUB]], 0
-; EVL-NEXT:    br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; EVL-NEXT:    br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
 ;
 ; NO-EVL-LABEL: @idx_scalar_dec(
 ; NO-EVL-NEXT:  entry:
 ; NO-EVL-NEXT:    [[CMP_NOT9:%.*]] = icmp eq i64 [[N:%.*]], 0
 ; NO-EVL-NEXT:    br i1 [[CMP_NOT9]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
 ; NO-EVL:       for.body.preheader:
+; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-EVL:       vector.ph:
+; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-EVL-NEXT:    [[IND_END:%.*]] = sub i64 [[N]], [[N_VEC]]
+; NO-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0
+; NO-EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; NO-EVL-NEXT:    [[INDUCTION:%.*]] = add <4 x i64> [[DOTSPLAT]], <i64 0, i64 -1, i64 -2, i64 -3>
+; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NO-EVL:       vector.body:
+; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL8:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <4 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL9:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], <i64 -4, i64 -4, i64 -4, i64 -4>
+; NO-EVL-NEXT:    [[TMP0:%.*]] = add <4 x i64> [[VEC_IND]], <i64 -1, i64 -1, i64 -1, i64 -1>
+; NO-EVL-NEXT:    [[TMP1:%.*]] = add <4 x i64> [[STEP_ADD]], <i64 -1, i64 -1, i64 -1, i64 -1>
+; NO-EVL-NEXT:    [[TMP2:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
+; NO-EVL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]]
+; NO-EVL-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP1]], i32 0
+; NO-EVL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; NO-EVL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
+; NO-EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -3
+; NO-EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 -4
+; NO-EVL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 -3
+; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8
+; NO-EVL-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; NO-EVL-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP9]], align 8
+; NO-EVL-NEXT:    [[REVERSE3:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD2]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; NO-EVL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP2]]
+; NO-EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
+; NO-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
+; NO-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 -3
+; NO-EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 -4
+; NO-EVL-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP14]], i32 -3
+; NO-EVL-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP13]], align 8
+; NO-EVL-NEXT:    [[REVERSE5:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD4]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; NO-EVL-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i64>, ptr [[TMP15]], align 8
+; NO-EVL-NEXT:    [[REVERSE7:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD6]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; NO-EVL-NEXT:    [[TMP16:%.*]] = icmp sgt <4 x i64> [[REVERSE]], [[REVERSE5]]
+; NO-EVL-NEXT:    [[TMP17:%.*]] = icmp sgt <4 x i64> [[REVERSE3]], [[REVERSE7]]
+; NO-EVL-NEXT:    [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP16]])
+; NO-EVL-NEXT:    [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP17]])
+; NO-EVL-NEXT:    [[CSA_MASK_SEL:%.*]] = select i1 [[TMP18]], <4 x i1> [[TMP16]], <4 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT:    [[CSA_MASK_SEL8]] = select i1 [[TMP19]], <4 x i1> [[TMP17]], <4 x i1> [[CSA_MASK_SEL]]
+; NO-EVL-NEXT:    [[CSA_DATA_SEL:%.*]] = select i1 [[TMP18]], <4 x i64> [[VEC_IND]], <4 x i64> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT:    [[CSA_DATA_SEL9]] = select i1 [[TMP19]], <4 x i64> [[STEP_ADD]], <4 x i64> [[CSA_DATA_SEL]]
+; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; NO-EVL-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], <i64 -4, i64 -4, i64 -4, i64 -4>
+; NO-EVL-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; NO-EVL:       middle.block:
+; NO-EVL-NEXT:    [[TMP21:%.*]] = select <4 x i1> [[CSA_MASK_SEL8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> zeroinitializer
+; NO-EVL-NEXT:    [[TMP22:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP21]])
+; NO-EVL-NEXT:    [[TMP23:%.*]] = extractelement <4 x i1> [[CSA_MASK_SEL8]], i64 0
+; NO-EVL-NEXT:    [[TMP24:%.*]] = icmp eq i32 [[TMP22]], 0
+; NO-EVL-NEXT:    [[TMP25:%.*]] = and i1 [[TMP23]], [[TMP24]]
+; NO-EVL-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i32 0, i32 -1
+; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <4 x i64> [[CSA_DATA_SEL9]], i32 [[TMP26]]
+; NO-EVL-NEXT:    [[TMP27:%.*]] = icmp sge i32 [[TMP26]], 0
+; NO-EVL-NEXT:    [[TMP28:%.*]] = select i1 [[TMP27]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
+; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL:       scalar.ph:
+; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ]
 ; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
 ; NO-EVL:       for.cond.cleanup.loopexit:
-; NO-EVL-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ]
 ; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; NO-EVL:       for.cond.cleanup:
-; NO-EVL-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II:%.*]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
 ; NO-EVL-NEXT:    ret i64 [[IDX_0_LCSSA]]
 ; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[I_011:%.*]] = phi i64 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ]
-; NO-EVL-NEXT:    [[IDX_010:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[FOR_BODY_PREHEADER]] ]
+; NO-EVL-NEXT:    [[I_011:%.*]] = phi i64 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; NO-EVL-NEXT:    [[IDX_010:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
 ; NO-EVL-NEXT:    [[SUB]] = add i64 [[I_011]], -1
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[SUB]]
-; NO-EVL-NEXT:    [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; NO-EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[SUB]]
-; NO-EVL-NEXT:    [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
-; NO-EVL-NEXT:    [[CMP3:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[SUB]]
+; NO-EVL-NEXT:    [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; NO-EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[SUB]]
+; NO-EVL-NEXT:    [[TMP30:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
+; NO-EVL-NEXT:    [[CMP3:%.*]] = icmp sgt i64 [[TMP29]], [[TMP30]]
 ; NO-EVL-NEXT:    [[COND]] = select i1 [[CMP3]], i64 [[I_011]], i64 [[IDX_010]]
 ; NO-EVL-NEXT:    [[CMP_NOT:%.*]] = icmp eq i64 [[SUB]], 0
-; NO-EVL-NEXT:    br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; NO-EVL-NEXT:    br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
 ;
 ; DATA-LABEL: @idx_scalar_dec(
 ; DATA-NEXT:  entry:
 ; DATA-NEXT:    [[CMP_NOT9:%.*]] = icmp eq i64 [[N:%.*]], 0
 ; DATA-NEXT:    br i1 [[CMP_NOT9]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
 ; DATA:       for.body.preheader:
+; DATA-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; DATA-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DATA:       vector.ph:
+; DATA-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; DATA-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; DATA-NEXT:    [[IND_END:%.*]] = sub i64 [[N]], [[N_VEC]]
+; DATA-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0
+; DATA-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; DATA-NEXT:    [[INDUCTION:%.*]] = add <4 x i64> [[DOTSPLAT]], <i64 0, i64 -1, i64 -2, i64 -3>
+; DATA-NEXT:    br label [[VECTOR_BODY:%.*]]
+; DATA:       vector.body:
+; DATA-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL8:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <4 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL9:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], <i64 -4, i64 -4, i64 -4, i64 -4>
+; DATA-NEXT:    [[TMP0:%.*]] = add <4 x i64> [[VEC_IND]], <i64 -1, i64 -1, i64 -1, i64 -1>
+; DATA-NEXT:    [[TMP1:%.*]] = add <4 x i64> [[STEP_ADD]], <i64 -1, i64 -1, i64 -1, i64 -1>
+; DATA-NEXT:    [[TMP2:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
+; DATA-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]]
+; DATA-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP1]], i32 0
+; DATA-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; DATA-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
+; DATA-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -3
+; DATA-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 -4
+; DATA-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 -3
+; DATA-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8
+; DATA-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; DATA-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP9]], align 8
+; DATA-NEXT:    [[REVERSE3:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD2]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; DATA-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP2]]
+; DATA-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
+; DATA-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
+; DATA-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 -3
+; DATA-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 -4
+; DATA-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP14]], i32 -3
+; DATA-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP13]], align 8
+; DATA-NEXT:    [[REVERSE5:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD4]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; DATA-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i64>, ptr [[TMP15]], align 8
+; DATA-NEXT:    [[REVERSE7:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD6]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; DATA-NEXT:    [[TMP16:%.*]] = icmp sgt <4 x i64> [[REVERSE]], [[REVERSE5]]
+; DATA-NEXT:    [[TMP17:%.*]] = icmp sgt <4 x i64> [[REVERSE3]], [[REVERSE7]]
+; DATA-NEXT:    [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP16]])
+; DATA-NEXT:    [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP17]])
+; DATA-NEXT:    [[CSA_MASK_SEL:%.*]] = select i1 [[TMP18]], <4 x i1> [[TMP16]], <4 x i1> [[CSA_MASK_PHI]]
+; DATA-NEXT:    [[CSA_MASK_SEL8]] = select i1 [[TMP19]], <4 x i1> [[TMP17]], <4 x i1> [[CSA_MASK_SEL]]
+; DATA-NEXT:    [[CSA_DATA_SEL:%.*]] = select i1 [[TMP18]], <4 x i64> [[VEC_IND]], <4 x i64> [[CSA_DATA_PHI]]
+; DATA-NEXT:    [[CSA_DATA_SEL9]] = select i1 [[TMP19]], <4 x i64> [[STEP_ADD]], <4 x i64> [[CSA_DATA_SEL]]
+; DATA-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; DATA-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], <i64 -4, i64 -4, i64 -4, i64 -4>
+; DATA-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DATA-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; DATA:       middle.block:
+; DATA-NEXT:    [[TMP21:%.*]] = select <4 x i1> [[CSA_MASK_SEL8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> zeroinitializer
+; DATA-NEXT:    [[TMP22:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP21]])
+; DATA-NEXT:    [[TMP23:%.*]] = extractelement <4 x i1> [[CSA_MASK_SEL8]], i64 0
+; DATA-NEXT:    [[TMP24:%.*]] = icmp eq i32 [[TMP22]], 0
+; DATA-NEXT:    [[TMP25:%.*]] = and i1 [[TMP23]], [[TMP24]]
+; DATA-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i32 0, i32 -1
+; DATA-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <4 x i64> [[CSA_DATA_SEL9]], i32 [[TMP26]]
+; DATA-NEXT:    [[TMP27:%.*]] = icmp sge i32 [[TMP26]], 0
+; DATA-NEXT:    [[TMP28:%.*]] = select i1 [[TMP27]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
+; DATA-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; DATA-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DATA:       scalar.ph:
+; DATA-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ]
 ; DATA-NEXT:    br label [[FOR_BODY:%.*]]
 ; DATA:       for.cond.cleanup.loopexit:
-; DATA-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ]
 ; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; DATA:       for.cond.cleanup:
-; DATA-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II:%.*]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; DATA-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
 ; DATA-NEXT:    ret i64 [[IDX_0_LCSSA]]
 ; DATA:       for.body:
-; DATA-NEXT:    [[I_011:%.*]] = phi i64 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ]
-; DATA-NEXT:    [[IDX_010:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[FOR_BODY_PREHEADER]] ]
+; DATA-NEXT:    [[I_011:%.*]] = phi i64 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; DATA-NEXT:    [[IDX_010:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
 ; DATA-NEXT:    [[SUB]] = add i64 [[I_011]], -1
-; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[SUB]]
-; DATA-NEXT:    [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; DATA-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[SUB]]
-; DATA-NEXT:    [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
-; DATA-NEXT:    [[CMP3:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]]
+; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[SUB]]
+; DATA-NEXT:    [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; DATA-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[SUB]]
+; DATA-NEXT:    [[TMP30:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
+; DATA-NEXT:    [[CMP3:%.*]] = icmp sgt i64 [[TMP29]], [[TMP30]]
 ; DATA-NEXT:    [[COND]] = select i1 [[CMP3]], i64 [[I_011]], i64 [[IDX_010]]
 ; DATA-NEXT:    [[CMP_NOT:%.*]] = icmp eq i64 [[SUB]], 0
-; DATA-NEXT:    br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; DATA-NEXT:    br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
 ;
 entry:
   %cmp.not9 = icmp eq i64 %n, 0
@@ -3303,24 +4692,79 @@ define i32 @simple_csa_int_select_neg_cond(i32 %N, ptr %data) {
 ; EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; EVL:       for.body.preheader:
 ; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; EVL:       vector.ph:
+; EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; EVL-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; EVL-NEXT:    [[TMP7:%.*]] = add <vscale x 4 x i64> [[TMP6]], zeroinitializer
+; EVL-NEXT:    [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP7]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
+; EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; EVL-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP10]]
+; EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP11]], i64 0
+; EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; EVL:       vector.body:
+; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP12]]
+; EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
+; EVL-NEXT:    [[TMP15:%.*]] = zext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
+; EVL-NEXT:    [[TMP16:%.*]] = icmp eq <vscale x 4 x i64> [[VEC_IND]], [[TMP15]]
+; EVL-NEXT:    [[TMP17:%.*]] = xor <vscale x 4 x i1> [[TMP16]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; EVL-NEXT:    [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP17]])
+; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP18]], <vscale x 4 x i1> [[TMP17]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP18]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
+; EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; EVL-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; EVL:       middle.block:
+; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; EVL-NEXT:    [[TMP20:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; EVL-NEXT:    [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP20]])
+; EVL-NEXT:    [[TMP22:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT:    [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
+; EVL-NEXT:    [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
+; EVL-NEXT:    [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
+; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP25]]
+; EVL-NEXT:    [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
+; EVL-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[CSA_EXTRACT]], i32 0
+; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL:       scalar.ph:
+; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; EVL-NEXT:    br label [[FOR_BODY:%.*]]
 ; EVL:       for.cond.cleanup.loopexit:
-; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
 ; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; EVL:       for.cond.cleanup:
-; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
 ; EVL-NEXT:    ret i32 [[T_0_LCSSA]]
 ; EVL:       for.body:
-; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[T_010:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
-; EVL-NEXT:    [[CMP1_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], [[TMP1]]
-; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP0]]
+; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[T_010:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP28:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT:    [[TMP29:%.*]] = zext i32 [[TMP28]] to i64
+; EVL-NEXT:    [[CMP1_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], [[TMP29]]
+; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP28]]
 ; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
 ;
 ; NO-EVL-LABEL: @simple_csa_int_select_neg_cond(
 ; NO-EVL-NEXT:  entry:
@@ -3328,24 +4772,79 @@ define i32 @simple_csa_int_select_neg_cond(i32 %N, ptr %data) {
 ; NO-EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; NO-EVL:       for.body.preheader:
 ; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-EVL:       vector.ph:
+; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; NO-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-EVL-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; NO-EVL-NEXT:    [[TMP7:%.*]] = add <vscale x 4 x i64> [[TMP6]], zeroinitializer
+; NO-EVL-NEXT:    [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP7]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; NO-EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
+; NO-EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; NO-EVL-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP10]]
+; NO-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP11]], i64 0
+; NO-EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NO-EVL:       vector.body:
+; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP12]]
+; NO-EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
+; NO-EVL-NEXT:    [[TMP15:%.*]] = zext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
+; NO-EVL-NEXT:    [[TMP16:%.*]] = icmp eq <vscale x 4 x i64> [[VEC_IND]], [[TMP15]]
+; NO-EVL-NEXT:    [[TMP17:%.*]] = xor <vscale x 4 x i1> [[TMP16]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; NO-EVL-NEXT:    [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP17]])
+; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP18]], <vscale x 4 x i1> [[TMP17]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP18]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; NO-EVL-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; NO-EVL:       middle.block:
+; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; NO-EVL-NEXT:    [[TMP20:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; NO-EVL-NEXT:    [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP20]])
+; NO-EVL-NEXT:    [[TMP22:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT:    [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
+; NO-EVL-NEXT:    [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
+; NO-EVL-NEXT:    [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
+; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP25]]
+; NO-EVL-NEXT:    [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
+; NO-EVL-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[CSA_EXTRACT]], i32 0
+; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL:       scalar.ph:
+; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
 ; NO-EVL:       for.cond.cleanup.loopexit:
-; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
 ; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; NO-EVL:       for.cond.cleanup:
-; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
 ; NO-EVL-NEXT:    ret i32 [[T_0_LCSSA]]
 ; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[T_010:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; NO-EVL-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
-; NO-EVL-NEXT:    [[CMP1_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], [[TMP1]]
-; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP0]]
+; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[T_010:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP28:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT:    [[TMP29:%.*]] = zext i32 [[TMP28]] to i64
+; NO-EVL-NEXT:    [[CMP1_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], [[TMP29]]
+; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP28]]
 ; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
 ;
 ; DATA-LABEL: @simple_csa_int_select_neg_cond(
 ; DATA-NEXT:  entry:
@@ -3353,24 +4852,79 @@ define i32 @simple_csa_int_select_neg_cond(i32 %N, ptr %data) {
 ; DATA-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; DATA:       for.body.preheader:
 ; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; DATA-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; DATA-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DATA:       vector.ph:
+; DATA-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; DATA-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; DATA-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; DATA-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; DATA-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; DATA-NEXT:    [[TMP7:%.*]] = add <vscale x 4 x i64> [[TMP6]], zeroinitializer
+; DATA-NEXT:    [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP7]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; DATA-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
+; DATA-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; DATA-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP10]]
+; DATA-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP11]], i64 0
+; DATA-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; DATA-NEXT:    br label [[VECTOR_BODY:%.*]]
+; DATA:       vector.body:
+; DATA-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; DATA-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP12]]
+; DATA-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; DATA-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
+; DATA-NEXT:    [[TMP15:%.*]] = zext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
+; DATA-NEXT:    [[TMP16:%.*]] = icmp eq <vscale x 4 x i64> [[VEC_IND]], [[TMP15]]
+; DATA-NEXT:    [[TMP17:%.*]] = xor <vscale x 4 x i1> [[TMP16]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; DATA-NEXT:    [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP17]])
+; DATA-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP18]], <vscale x 4 x i1> [[TMP17]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; DATA-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP18]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
+; DATA-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; DATA-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; DATA-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DATA-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; DATA:       middle.block:
+; DATA-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; DATA-NEXT:    [[TMP20:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; DATA-NEXT:    [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP20]])
+; DATA-NEXT:    [[TMP22:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; DATA-NEXT:    [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
+; DATA-NEXT:    [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
+; DATA-NEXT:    [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
+; DATA-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP25]]
+; DATA-NEXT:    [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
+; DATA-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[CSA_EXTRACT]], i32 0
+; DATA-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; DATA-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DATA:       scalar.ph:
+; DATA-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; DATA-NEXT:    br label [[FOR_BODY:%.*]]
 ; DATA:       for.cond.cleanup.loopexit:
-; DATA-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
 ; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; DATA:       for.cond.cleanup:
-; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
 ; DATA-NEXT:    ret i32 [[T_0_LCSSA]]
 ; DATA:       for.body:
-; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[T_010:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; DATA-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
-; DATA-NEXT:    [[CMP1_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], [[TMP1]]
-; DATA-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP0]]
+; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[T_010:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP28:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; DATA-NEXT:    [[TMP29:%.*]] = zext i32 [[TMP28]] to i64
+; DATA-NEXT:    [[CMP1_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], [[TMP29]]
+; DATA-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP28]]
 ; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
 ;
 entry:
   %cmp9 = icmp sgt i32 %N, 0
@@ -3417,25 +4971,80 @@ define ptr @simple_csa_ptr_select(i32 %N, ptr %data) {
 ; EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; EVL:       for.body.preheader:
 ; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; EVL:       vector.ph:
+; EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; EVL-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; EVL-NEXT:    [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
+; EVL-NEXT:    [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
+; EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; EVL-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP10]]
+; EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
+; EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; EVL:       vector.body:
+; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x ptr> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[TMP12]]
+; EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds ptr, ptr [[TMP13]], i32 0
+; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x ptr>, ptr [[TMP14]], align 8
+; EVL-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> [[WIDE_LOAD]], i32 4, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i32> poison)
+; EVL-NEXT:    [[TMP15:%.*]] = sext <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i64>
+; EVL-NEXT:    [[TMP16:%.*]] = icmp slt <vscale x 2 x i64> [[VEC_IND]], [[TMP15]]
+; EVL-NEXT:    [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP16]])
+; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 2 x i1> [[TMP16]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 2 x ptr> [[WIDE_LOAD]], <vscale x 2 x ptr> [[CSA_DATA_PHI]]
+; EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; EVL-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; EVL:       middle.block:
+; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
+; EVL-NEXT:    [[TMP19:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
+; EVL-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP19]])
+; EVL-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT:    [[TMP22:%.*]] = icmp eq i32 [[TMP20]], 0
+; EVL-NEXT:    [[TMP23:%.*]] = and i1 [[TMP21]], [[TMP22]]
+; EVL-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i32 0, i32 -1
+; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x ptr> [[CSA_DATA_SEL]], i32 [[TMP24]]
+; EVL-NEXT:    [[TMP25:%.*]] = icmp sge i32 [[TMP24]], 0
+; EVL-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], ptr [[CSA_EXTRACT]], ptr null
+; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL:       scalar.ph:
+; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; EVL-NEXT:    br label [[FOR_BODY:%.*]]
 ; EVL:       for.cond.cleanup.loopexit:
-; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
 ; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; EVL:       for.cond.cleanup:
-; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
 ; EVL-NEXT:    ret ptr [[T_0_LCSSA]]
 ; EVL:       for.body:
-; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[T_010:%.*]] = phi ptr [ null, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
-; EVL-NEXT:    [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
-; EVL-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
-; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP2]]
-; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP0]], ptr [[T_010]]
+; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[T_010:%.*]] = phi ptr [ null, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
+; EVL-NEXT:    [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4
+; EVL-NEXT:    [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
+; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP29]]
+; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP27]], ptr [[T_010]]
 ; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
 ;
 ; NO-EVL-LABEL: @simple_csa_ptr_select(
 ; NO-EVL-NEXT:  entry:
@@ -3443,25 +5052,80 @@ define ptr @simple_csa_ptr_select(i32 %N, ptr %data) {
 ; NO-EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; NO-EVL:       for.body.preheader:
 ; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-EVL:       vector.ph:
+; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; NO-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; NO-EVL-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; NO-EVL-NEXT:    [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
+; NO-EVL-NEXT:    [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; NO-EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
+; NO-EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; NO-EVL-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP10]]
+; NO-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
+; NO-EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NO-EVL:       vector.body:
+; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x ptr> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[TMP12]]
+; NO-EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds ptr, ptr [[TMP13]], i32 0
+; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x ptr>, ptr [[TMP14]], align 8
+; NO-EVL-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> [[WIDE_LOAD]], i32 4, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i32> poison)
+; NO-EVL-NEXT:    [[TMP15:%.*]] = sext <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i64>
+; NO-EVL-NEXT:    [[TMP16:%.*]] = icmp slt <vscale x 2 x i64> [[VEC_IND]], [[TMP15]]
+; NO-EVL-NEXT:    [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP16]])
+; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 2 x i1> [[TMP16]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 2 x ptr> [[WIDE_LOAD]], <vscale x 2 x ptr> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; NO-EVL-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; NO-EVL:       middle.block:
+; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
+; NO-EVL-NEXT:    [[TMP19:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
+; NO-EVL-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP19]])
+; NO-EVL-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT:    [[TMP22:%.*]] = icmp eq i32 [[TMP20]], 0
+; NO-EVL-NEXT:    [[TMP23:%.*]] = and i1 [[TMP21]], [[TMP22]]
+; NO-EVL-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i32 0, i32 -1
+; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x ptr> [[CSA_DATA_SEL]], i32 [[TMP24]]
+; NO-EVL-NEXT:    [[TMP25:%.*]] = icmp sge i32 [[TMP24]], 0
+; NO-EVL-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], ptr [[CSA_EXTRACT]], ptr null
+; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL:       scalar.ph:
+; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
 ; NO-EVL:       for.cond.cleanup.loopexit:
-; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
 ; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; NO-EVL:       for.cond.cleanup:
-; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
 ; NO-EVL-NEXT:    ret ptr [[T_0_LCSSA]]
 ; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[T_010:%.*]] = phi ptr [ null, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
-; NO-EVL-NEXT:    [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
-; NO-EVL-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
-; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP2]]
-; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP0]], ptr [[T_010]]
+; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[T_010:%.*]] = phi ptr [ null, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
+; NO-EVL-NEXT:    [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4
+; NO-EVL-NEXT:    [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
+; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP29]]
+; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP27]], ptr [[T_010]]
 ; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
 ;
 ; DATA-LABEL: @simple_csa_ptr_select(
 ; DATA-NEXT:  entry:
@@ -3469,25 +5133,80 @@ define ptr @simple_csa_ptr_select(i32 %N, ptr %data) {
 ; DATA-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; DATA:       for.body.preheader:
 ; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; DATA-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; DATA-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DATA:       vector.ph:
+; DATA-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; DATA-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; DATA-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; DATA-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; DATA-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; DATA-NEXT:    [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
+; DATA-NEXT:    [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; DATA-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
+; DATA-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; DATA-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP10]]
+; DATA-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
+; DATA-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; DATA-NEXT:    br label [[VECTOR_BODY:%.*]]
+; DATA:       vector.body:
+; DATA-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x ptr> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; DATA-NEXT:    [[TMP13:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[TMP12]]
+; DATA-NEXT:    [[TMP14:%.*]] = getelementptr inbounds ptr, ptr [[TMP13]], i32 0
+; DATA-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x ptr>, ptr [[TMP14]], align 8
+; DATA-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> [[WIDE_LOAD]], i32 4, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i32> poison)
+; DATA-NEXT:    [[TMP15:%.*]] = sext <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i64>
+; DATA-NEXT:    [[TMP16:%.*]] = icmp slt <vscale x 2 x i64> [[VEC_IND]], [[TMP15]]
+; DATA-NEXT:    [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP16]])
+; DATA-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 2 x i1> [[TMP16]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
+; DATA-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 2 x ptr> [[WIDE_LOAD]], <vscale x 2 x ptr> [[CSA_DATA_PHI]]
+; DATA-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; DATA-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; DATA-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DATA-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; DATA:       middle.block:
+; DATA-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
+; DATA-NEXT:    [[TMP19:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
+; DATA-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP19]])
+; DATA-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
+; DATA-NEXT:    [[TMP22:%.*]] = icmp eq i32 [[TMP20]], 0
+; DATA-NEXT:    [[TMP23:%.*]] = and i1 [[TMP21]], [[TMP22]]
+; DATA-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i32 0, i32 -1
+; DATA-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x ptr> [[CSA_DATA_SEL]], i32 [[TMP24]]
+; DATA-NEXT:    [[TMP25:%.*]] = icmp sge i32 [[TMP24]], 0
+; DATA-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], ptr [[CSA_EXTRACT]], ptr null
+; DATA-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; DATA-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DATA:       scalar.ph:
+; DATA-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; DATA-NEXT:    br label [[FOR_BODY:%.*]]
 ; DATA:       for.cond.cleanup.loopexit:
-; DATA-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
 ; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; DATA:       for.cond.cleanup:
-; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
 ; DATA-NEXT:    ret ptr [[T_0_LCSSA]]
 ; DATA:       for.body:
-; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[T_010:%.*]] = phi ptr [ null, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
-; DATA-NEXT:    [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
-; DATA-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
-; DATA-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP2]]
-; DATA-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP0]], ptr [[T_010]]
+; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[T_010:%.*]] = phi ptr [ null, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
+; DATA-NEXT:    [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4
+; DATA-NEXT:    [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
+; DATA-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP29]]
+; DATA-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP27]], ptr [[T_010]]
 ; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
 ;
 entry:
   %cmp9 = icmp sgt i32 %N, 0

>From 55ee9b993ae4ef068443b1a7c9efe6ad1251293a Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Tue, 20 Aug 2024 10:11:25 -0700
Subject: [PATCH 05/23] [VPlan] Add cost model for CSA

---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  16 +-
 llvm/lib/Transforms/Vectorize/VPlan.h         |  11 +-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 100 +++++++
 .../Transforms/LoopVectorize/RISCV/csa.ll     | 279 +++++++-----------
 4 files changed, 236 insertions(+), 170 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e2f11543c564ed..d29a3347ff58ea 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7324,9 +7324,17 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
 /// not have corresponding recipes in \p Plan and are not marked to be ignored
 /// in \p CostCtx. This means the VPlan contains simplification that the legacy
 /// cost-model did not account for.
-static bool planContainsAdditionalSimplifications(VPlan &Plan,
-                                                  VPCostContext &CostCtx,
-                                                  Loop *TheLoop) {
+static bool
+planContainsAdditionalSimplifications(VPlan &Plan, VPCostContext &CostCtx,
+                                      Loop *TheLoop,
+                                      LoopVectorizationLegality &Legal) {
+  // CSA cost is more complicated since there is significant overhead in the
+  // preheader and middle block. It also contains recipes that are not backed by
+  // underlying instructions in the original loop. This makes it difficult to
+  // model in the legacy cost model.
+  if (!Legal.getCSAs().empty())
+    return true;
+
   // First collect all instructions for the recipes in Plan.
   auto GetInstructionForCost = [](const VPRecipeBase *R) -> Instruction * {
     if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
@@ -7433,7 +7441,7 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
   precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
   assert((BestFactor.Width == LegacyVF.Width ||
           planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
-                                                CostCtx, OrigLoop)) &&
+                                                CostCtx, OrigLoop, *Legal)) &&
          " VPlan cost model and legacy cost model disagreed");
   assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
          "when vectorizing, the scalar cost must be computed.");
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index e4fc09a4d650fd..163f14de8bd3f2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2653,6 +2653,9 @@ class VPCSAHeaderPHIRecipe final : public VPHeaderPHIRecipe {
 
   void execute(VPTransformState &State) override;
 
+  InstructionCost computeCost(ElementCount VF,
+                              VPCostContext &Ctx) const override;
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
@@ -2684,6 +2687,9 @@ class VPCSADataUpdateRecipe final : public VPSingleDefRecipe {
 
   void execute(VPTransformState &State) override;
 
+  InstructionCost computeCost(ElementCount VF,
+                              VPCostContext &Ctx) const override;
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
@@ -2730,6 +2736,9 @@ class VPCSAExtractScalarRecipe final : public VPSingleDefRecipe {
 
   void execute(VPTransformState &State) override;
 
+  InstructionCost computeCost(ElementCount VF,
+                              VPCostContext &Ctx) const override;
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
@@ -2740,7 +2749,7 @@ class VPCSAExtractScalarRecipe final : public VPSingleDefRecipe {
   VPValue *getVPMaskSel() const { return getOperand(1); }
   VPValue *getVPDataSel() const { return getOperand(2); }
   VPValue *getVPCSAVLSel() const { return getOperand(3); }
-  bool usesEVL() { return getNumOperands() == 4; }
+  bool usesEVL() const { return getNumOperands() == 4; }
 };
 
 /// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 8380e946a8ebb6..b85145d0073a31 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2291,6 +2291,24 @@ void VPCSAHeaderPHIRecipe::execute(VPTransformState &State) {
     State.set(this, DataPhi, Part);
 }
 
+InstructionCost VPCSAHeaderPHIRecipe::computeCost(ElementCount VF,
+                                                  VPCostContext &Ctx) const {
+  if (VF.isScalar())
+    return 0;
+
+  InstructionCost C = 0;
+  auto *VTy = VectorType::get(getUnderlyingValue()->getType(), VF);
+  const TargetTransformInfo &TTI = Ctx.TTI;
+
+  // FIXME: These costs should be moved into VPInstruction::computeCost. We put
+  // them here for now since there is no VPInstruction::computeCost support.
+  // CSAInitMask
+  C += TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VTy);
+  // CSAInitData
+  C += TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VTy);
+  return C;
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPCSADataUpdateRecipe::print(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
@@ -2319,6 +2337,34 @@ void VPCSADataUpdateRecipe::execute(VPTransformState &State) {
   }
 }
 
+InstructionCost VPCSADataUpdateRecipe::computeCost(ElementCount VF,
+                                                   VPCostContext &Ctx) const {
+  if (VF.isScalar())
+    return 0;
+
+  InstructionCost C = 0;
+  auto *VTy = VectorType::get(getUnderlyingValue()->getType(), VF);
+  auto *MaskTy = VectorType::get(IntegerType::getInt1Ty(VTy->getContext()), VF);
+  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  const TargetTransformInfo &TTI = Ctx.TTI;
+
+  // Data Update
+  C += TTI.getArithmeticInstrCost(Instruction::Select, VTy, CostKind);
+
+  // FIXME: These costs should be moved into VPInstruction::computeCost. We put
+  // them here for now since they are related to updating the data and there is
+  // no VPInstruction::computeCost support at the moment. CSAInitMask AnyActive
+  C += TTI.getArithmeticInstrCost(Instruction::Select, VTy, CostKind);
+  // vp.reduce.or
+  C += TTI.getArithmeticReductionCost(Instruction::Or, VTy, std::nullopt,
+                                      CostKind);
+  // VPVLSel
+  C += TTI.getArithmeticInstrCost(Instruction::Select, VTy, CostKind);
+  // MaskUpdate
+  C += TTI.getArithmeticInstrCost(Instruction::Select, MaskTy, CostKind);
+  return C;
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPCSAExtractScalarRecipe::print(raw_ostream &O, const Twine &Indent,
                                      VPSlotTracker &SlotTracker) const {
@@ -2379,6 +2425,60 @@ void VPCSAExtractScalarRecipe::execute(VPTransformState &State) {
   State.set(this, ChooseFromVecOrInit, 0, /*IsScalar=*/true);
 }
 
+InstructionCost
+VPCSAExtractScalarRecipe::computeCost(ElementCount VF,
+                                      VPCostContext &Ctx) const {
+  if (VF.isScalar())
+    return 0;
+
+  InstructionCost C = 0;
+  auto *VTy = VectorType::get(getUnderlyingValue()->getType(), VF);
+  auto *Int32VTy =
+      VectorType::get(IntegerType::getInt32Ty(VTy->getContext()), VF);
+  auto *MaskTy = VectorType::get(IntegerType::getInt1Ty(VTy->getContext()), VF);
+  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  const TargetTransformInfo &TTI = Ctx.TTI;
+
+  // StepVector
+  ArrayRef<Value *> Args;
+  IntrinsicCostAttributes CostAttrs(Intrinsic::stepvector, Int32VTy, Args);
+  C += TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
+  // NegOneSplat
+  C += TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, Int32VTy);
+  // LastIdx
+  if (usesEVL()) {
+    C += TTI.getMinMaxReductionCost(Intrinsic::smax, Int32VTy, FastMathFlags(),
+                                    CostKind);
+  } else {
+    // ActiveLaneIdxs
+    C += TTI.getArithmeticInstrCost(Instruction::Select,
+                                    MaskTy->getScalarType(), CostKind);
+    // MaybeLastIdx
+    C += TTI.getMinMaxReductionCost(Intrinsic::smax, Int32VTy, FastMathFlags(),
+                                    CostKind);
+    // IsLaneZeroActive
+    C += TTI.getArithmeticInstrCost(Instruction::ExtractElement, MaskTy,
+                                    CostKind);
+    // MaybeLastIdxEQZero
+    C += TTI.getArithmeticInstrCost(Instruction::ICmp, MaskTy->getScalarType(),
+                                    CostKind);
+    // And
+    C += TTI.getArithmeticInstrCost(Instruction::And, MaskTy->getScalarType(),
+                                    CostKind);
+    // LastIdx
+    C += TTI.getArithmeticInstrCost(Instruction::Select, VTy->getScalarType(),
+                                    CostKind);
+  }
+  // ExtractFromVec
+  C += TTI.getArithmeticInstrCost(Instruction::ExtractElement, VTy, CostKind);
+  // LastIdxGeZero
+  C += TTI.getArithmeticInstrCost(Instruction::ICmp, Int32VTy, CostKind);
+  // ChooseFromVecOrInit
+  C += TTI.getArithmeticInstrCost(Instruction::Select, VTy->getScalarType(),
+                                  CostKind);
+  return C;
+}
+
 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
   assert(State.Lane && "Branch on Mask works only on single instance.");
 
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/csa.ll b/llvm/test/Transforms/LoopVectorize/RISCV/csa.ll
index 89a2dbadd3a2ff..c90d88d912edaa 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/csa.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/csa.ll
@@ -4383,69 +4383,52 @@ define dso_local i64 @idx_scalar_dec(ptr %a, ptr %b, i64 %ii, i64 %n) {
 ; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
 ; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; EVL-NEXT:    [[IND_END:%.*]] = sub i64 [[N]], [[N_VEC]]
-; EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0
-; EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; EVL-NEXT:    [[INDUCTION:%.*]] = add <4 x i64> [[DOTSPLAT]], <i64 0, i64 -1, i64 -2, i64 -3>
+; EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[N]], i64 0
+; EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
+; EVL-NEXT:    [[INDUCTION:%.*]] = add <8 x i64> [[DOTSPLAT]], <i64 0, i64 -1, i64 -2, i64 -3, i64 -4, i64 -5, i64 -6, i64 -7>
 ; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; EVL:       vector.body:
 ; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL8:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <4 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL9:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], <i64 -4, i64 -4, i64 -4, i64 -4>
-; EVL-NEXT:    [[TMP0:%.*]] = add <4 x i64> [[VEC_IND]], <i64 -1, i64 -1, i64 -1, i64 -1>
-; EVL-NEXT:    [[TMP1:%.*]] = add <4 x i64> [[STEP_ADD]], <i64 -1, i64 -1, i64 -1, i64 -1>
-; EVL-NEXT:    [[TMP2:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
-; EVL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]]
-; EVL-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP1]], i32 0
-; EVL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
-; EVL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
-; EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -3
-; EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 -4
-; EVL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 -3
-; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8
-; EVL-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; EVL-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP9]], align 8
-; EVL-NEXT:    [[REVERSE3:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD2]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; EVL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP2]]
-; EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
-; EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
-; EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 -3
-; EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 -4
-; EVL-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP14]], i32 -3
-; EVL-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP13]], align 8
-; EVL-NEXT:    [[REVERSE5:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD4]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; EVL-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i64>, ptr [[TMP15]], align 8
-; EVL-NEXT:    [[REVERSE7:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD6]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; EVL-NEXT:    [[TMP16:%.*]] = icmp sgt <4 x i64> [[REVERSE]], [[REVERSE5]]
-; EVL-NEXT:    [[TMP17:%.*]] = icmp sgt <4 x i64> [[REVERSE3]], [[REVERSE7]]
-; EVL-NEXT:    [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP16]])
-; EVL-NEXT:    [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP17]])
-; EVL-NEXT:    [[CSA_MASK_SEL:%.*]] = select i1 [[TMP18]], <4 x i1> [[TMP16]], <4 x i1> [[CSA_MASK_PHI]]
-; EVL-NEXT:    [[CSA_MASK_SEL8]] = select i1 [[TMP19]], <4 x i1> [[TMP17]], <4 x i1> [[CSA_MASK_SEL]]
-; EVL-NEXT:    [[CSA_DATA_SEL:%.*]] = select i1 [[TMP18]], <4 x i64> [[VEC_IND]], <4 x i64> [[CSA_DATA_PHI]]
-; EVL-NEXT:    [[CSA_DATA_SEL9]] = select i1 [[TMP19]], <4 x i64> [[STEP_ADD]], <4 x i64> [[CSA_DATA_SEL]]
+; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <8 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[VEC_IND:%.*]] = phi <8 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <8 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[TMP0:%.*]] = add <8 x i64> [[VEC_IND]], <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
+; EVL-NEXT:    [[TMP1:%.*]] = extractelement <8 x i64> [[TMP0]], i32 0
+; EVL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP1]]
+; EVL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
+; EVL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 -7
+; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i64>, ptr [[TMP4]], align 8
+; EVL-NEXT:    [[REVERSE:%.*]] = shufflevector <8 x i64> [[WIDE_LOAD]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; EVL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP1]]
+; EVL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
+; EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -7
+; EVL-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i64>, ptr [[TMP7]], align 8
+; EVL-NEXT:    [[REVERSE2:%.*]] = shufflevector <8 x i64> [[WIDE_LOAD1]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; EVL-NEXT:    [[TMP8:%.*]] = icmp sgt <8 x i64> [[REVERSE]], [[REVERSE2]]
+; EVL-NEXT:    [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP8]])
+; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP9]], <8 x i1> [[TMP8]], <8 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP9]], <8 x i64> [[VEC_IND]], <8 x i64> [[CSA_DATA_PHI]]
 ; EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; EVL-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], <i64 -4, i64 -4, i64 -4, i64 -4>
-; EVL-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; EVL-NEXT:    [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], <i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8>
+; EVL-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; EVL:       middle.block:
-; EVL-NEXT:    [[TMP21:%.*]] = select <4 x i1> [[CSA_MASK_SEL8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> zeroinitializer
-; EVL-NEXT:    [[TMP22:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP21]])
-; EVL-NEXT:    [[TMP23:%.*]] = extractelement <4 x i1> [[CSA_MASK_SEL8]], i64 0
-; EVL-NEXT:    [[TMP24:%.*]] = icmp eq i32 [[TMP22]], 0
-; EVL-NEXT:    [[TMP25:%.*]] = and i1 [[TMP23]], [[TMP24]]
-; EVL-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i32 0, i32 -1
-; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <4 x i64> [[CSA_DATA_SEL9]], i32 [[TMP26]]
-; EVL-NEXT:    [[TMP27:%.*]] = icmp sge i32 [[TMP26]], 0
-; EVL-NEXT:    [[TMP28:%.*]] = select i1 [[TMP27]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
+; EVL-NEXT:    [[TMP11:%.*]] = select <8 x i1> [[CSA_MASK_SEL]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, <8 x i32> zeroinitializer
+; EVL-NEXT:    [[TMP12:%.*]] = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> [[TMP11]])
+; EVL-NEXT:    [[TMP13:%.*]] = extractelement <8 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], 0
+; EVL-NEXT:    [[TMP15:%.*]] = and i1 [[TMP13]], [[TMP14]]
+; EVL-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 0, i32 -1
+; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <8 x i64> [[CSA_DATA_SEL]], i32 [[TMP16]]
+; EVL-NEXT:    [[TMP17:%.*]] = icmp sge i32 [[TMP16]], 0
+; EVL-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
 ; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; EVL:       scalar.ph:
 ; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ]
 ; EVL-NEXT:    br label [[FOR_BODY:%.*]]
 ; EVL:       for.cond.cleanup.loopexit:
-; EVL-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
 ; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; EVL:       for.cond.cleanup:
 ; EVL-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
@@ -4455,10 +4438,10 @@ define dso_local i64 @idx_scalar_dec(ptr %a, ptr %b, i64 %ii, i64 %n) {
 ; EVL-NEXT:    [[IDX_010:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
 ; EVL-NEXT:    [[SUB]] = add i64 [[I_011]], -1
 ; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[SUB]]
-; EVL-NEXT:    [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; EVL-NEXT:    [[TMP19:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
 ; EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[SUB]]
-; EVL-NEXT:    [[TMP30:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
-; EVL-NEXT:    [[CMP3:%.*]] = icmp sgt i64 [[TMP29]], [[TMP30]]
+; EVL-NEXT:    [[TMP20:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
+; EVL-NEXT:    [[CMP3:%.*]] = icmp sgt i64 [[TMP19]], [[TMP20]]
 ; EVL-NEXT:    [[COND]] = select i1 [[CMP3]], i64 [[I_011]], i64 [[IDX_010]]
 ; EVL-NEXT:    [[CMP_NOT:%.*]] = icmp eq i64 [[SUB]], 0
 ; EVL-NEXT:    br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
@@ -4474,69 +4457,52 @@ define dso_local i64 @idx_scalar_dec(ptr %a, ptr %b, i64 %ii, i64 %n) {
 ; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
 ; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; NO-EVL-NEXT:    [[IND_END:%.*]] = sub i64 [[N]], [[N_VEC]]
-; NO-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0
-; NO-EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; NO-EVL-NEXT:    [[INDUCTION:%.*]] = add <4 x i64> [[DOTSPLAT]], <i64 0, i64 -1, i64 -2, i64 -3>
+; NO-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[N]], i64 0
+; NO-EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
+; NO-EVL-NEXT:    [[INDUCTION:%.*]] = add <8 x i64> [[DOTSPLAT]], <i64 0, i64 -1, i64 -2, i64 -3, i64 -4, i64 -5, i64 -6, i64 -7>
 ; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NO-EVL:       vector.body:
 ; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL8:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <4 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL9:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], <i64 -4, i64 -4, i64 -4, i64 -4>
-; NO-EVL-NEXT:    [[TMP0:%.*]] = add <4 x i64> [[VEC_IND]], <i64 -1, i64 -1, i64 -1, i64 -1>
-; NO-EVL-NEXT:    [[TMP1:%.*]] = add <4 x i64> [[STEP_ADD]], <i64 -1, i64 -1, i64 -1, i64 -1>
-; NO-EVL-NEXT:    [[TMP2:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
-; NO-EVL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]]
-; NO-EVL-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP1]], i32 0
-; NO-EVL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
-; NO-EVL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
-; NO-EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -3
-; NO-EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 -4
-; NO-EVL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 -3
-; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8
-; NO-EVL-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; NO-EVL-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP9]], align 8
-; NO-EVL-NEXT:    [[REVERSE3:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD2]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; NO-EVL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP2]]
-; NO-EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
-; NO-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
-; NO-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 -3
-; NO-EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 -4
-; NO-EVL-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP14]], i32 -3
-; NO-EVL-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP13]], align 8
-; NO-EVL-NEXT:    [[REVERSE5:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD4]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; NO-EVL-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i64>, ptr [[TMP15]], align 8
-; NO-EVL-NEXT:    [[REVERSE7:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD6]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; NO-EVL-NEXT:    [[TMP16:%.*]] = icmp sgt <4 x i64> [[REVERSE]], [[REVERSE5]]
-; NO-EVL-NEXT:    [[TMP17:%.*]] = icmp sgt <4 x i64> [[REVERSE3]], [[REVERSE7]]
-; NO-EVL-NEXT:    [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP16]])
-; NO-EVL-NEXT:    [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP17]])
-; NO-EVL-NEXT:    [[CSA_MASK_SEL:%.*]] = select i1 [[TMP18]], <4 x i1> [[TMP16]], <4 x i1> [[CSA_MASK_PHI]]
-; NO-EVL-NEXT:    [[CSA_MASK_SEL8]] = select i1 [[TMP19]], <4 x i1> [[TMP17]], <4 x i1> [[CSA_MASK_SEL]]
-; NO-EVL-NEXT:    [[CSA_DATA_SEL:%.*]] = select i1 [[TMP18]], <4 x i64> [[VEC_IND]], <4 x i64> [[CSA_DATA_PHI]]
-; NO-EVL-NEXT:    [[CSA_DATA_SEL9]] = select i1 [[TMP19]], <4 x i64> [[STEP_ADD]], <4 x i64> [[CSA_DATA_SEL]]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <8 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[VEC_IND:%.*]] = phi <8 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <8 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[TMP0:%.*]] = add <8 x i64> [[VEC_IND]], <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
+; NO-EVL-NEXT:    [[TMP1:%.*]] = extractelement <8 x i64> [[TMP0]], i32 0
+; NO-EVL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP1]]
+; NO-EVL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
+; NO-EVL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 -7
+; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i64>, ptr [[TMP4]], align 8
+; NO-EVL-NEXT:    [[REVERSE:%.*]] = shufflevector <8 x i64> [[WIDE_LOAD]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; NO-EVL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP1]]
+; NO-EVL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
+; NO-EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -7
+; NO-EVL-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i64>, ptr [[TMP7]], align 8
+; NO-EVL-NEXT:    [[REVERSE2:%.*]] = shufflevector <8 x i64> [[WIDE_LOAD1]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; NO-EVL-NEXT:    [[TMP8:%.*]] = icmp sgt <8 x i64> [[REVERSE]], [[REVERSE2]]
+; NO-EVL-NEXT:    [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP8]])
+; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP9]], <8 x i1> [[TMP8]], <8 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP9]], <8 x i64> [[VEC_IND]], <8 x i64> [[CSA_DATA_PHI]]
 ; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; NO-EVL-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], <i64 -4, i64 -4, i64 -4, i64 -4>
-; NO-EVL-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; NO-EVL-NEXT:    [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], <i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8>
+; NO-EVL-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; NO-EVL:       middle.block:
-; NO-EVL-NEXT:    [[TMP21:%.*]] = select <4 x i1> [[CSA_MASK_SEL8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> zeroinitializer
-; NO-EVL-NEXT:    [[TMP22:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP21]])
-; NO-EVL-NEXT:    [[TMP23:%.*]] = extractelement <4 x i1> [[CSA_MASK_SEL8]], i64 0
-; NO-EVL-NEXT:    [[TMP24:%.*]] = icmp eq i32 [[TMP22]], 0
-; NO-EVL-NEXT:    [[TMP25:%.*]] = and i1 [[TMP23]], [[TMP24]]
-; NO-EVL-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i32 0, i32 -1
-; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <4 x i64> [[CSA_DATA_SEL9]], i32 [[TMP26]]
-; NO-EVL-NEXT:    [[TMP27:%.*]] = icmp sge i32 [[TMP26]], 0
-; NO-EVL-NEXT:    [[TMP28:%.*]] = select i1 [[TMP27]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
+; NO-EVL-NEXT:    [[TMP11:%.*]] = select <8 x i1> [[CSA_MASK_SEL]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, <8 x i32> zeroinitializer
+; NO-EVL-NEXT:    [[TMP12:%.*]] = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> [[TMP11]])
+; NO-EVL-NEXT:    [[TMP13:%.*]] = extractelement <8 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], 0
+; NO-EVL-NEXT:    [[TMP15:%.*]] = and i1 [[TMP13]], [[TMP14]]
+; NO-EVL-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 0, i32 -1
+; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <8 x i64> [[CSA_DATA_SEL]], i32 [[TMP16]]
+; NO-EVL-NEXT:    [[TMP17:%.*]] = icmp sge i32 [[TMP16]], 0
+; NO-EVL-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
 ; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; NO-EVL:       scalar.ph:
 ; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ]
 ; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
 ; NO-EVL:       for.cond.cleanup.loopexit:
-; NO-EVL-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
 ; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; NO-EVL:       for.cond.cleanup:
 ; NO-EVL-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
@@ -4546,10 +4512,10 @@ define dso_local i64 @idx_scalar_dec(ptr %a, ptr %b, i64 %ii, i64 %n) {
 ; NO-EVL-NEXT:    [[IDX_010:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
 ; NO-EVL-NEXT:    [[SUB]] = add i64 [[I_011]], -1
 ; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[SUB]]
-; NO-EVL-NEXT:    [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; NO-EVL-NEXT:    [[TMP19:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
 ; NO-EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[SUB]]
-; NO-EVL-NEXT:    [[TMP30:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
-; NO-EVL-NEXT:    [[CMP3:%.*]] = icmp sgt i64 [[TMP29]], [[TMP30]]
+; NO-EVL-NEXT:    [[TMP20:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
+; NO-EVL-NEXT:    [[CMP3:%.*]] = icmp sgt i64 [[TMP19]], [[TMP20]]
 ; NO-EVL-NEXT:    [[COND]] = select i1 [[CMP3]], i64 [[I_011]], i64 [[IDX_010]]
 ; NO-EVL-NEXT:    [[CMP_NOT:%.*]] = icmp eq i64 [[SUB]], 0
 ; NO-EVL-NEXT:    br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
@@ -4565,69 +4531,52 @@ define dso_local i64 @idx_scalar_dec(ptr %a, ptr %b, i64 %ii, i64 %n) {
 ; DATA-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
 ; DATA-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; DATA-NEXT:    [[IND_END:%.*]] = sub i64 [[N]], [[N_VEC]]
-; DATA-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0
-; DATA-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; DATA-NEXT:    [[INDUCTION:%.*]] = add <4 x i64> [[DOTSPLAT]], <i64 0, i64 -1, i64 -2, i64 -3>
+; DATA-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[N]], i64 0
+; DATA-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
+; DATA-NEXT:    [[INDUCTION:%.*]] = add <8 x i64> [[DOTSPLAT]], <i64 0, i64 -1, i64 -2, i64 -3, i64 -4, i64 -5, i64 -6, i64 -7>
 ; DATA-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; DATA:       vector.body:
 ; DATA-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL8:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <4 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL9:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], <i64 -4, i64 -4, i64 -4, i64 -4>
-; DATA-NEXT:    [[TMP0:%.*]] = add <4 x i64> [[VEC_IND]], <i64 -1, i64 -1, i64 -1, i64 -1>
-; DATA-NEXT:    [[TMP1:%.*]] = add <4 x i64> [[STEP_ADD]], <i64 -1, i64 -1, i64 -1, i64 -1>
-; DATA-NEXT:    [[TMP2:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
-; DATA-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]]
-; DATA-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP1]], i32 0
-; DATA-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
-; DATA-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
-; DATA-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -3
-; DATA-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 -4
-; DATA-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 -3
-; DATA-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8
-; DATA-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; DATA-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP9]], align 8
-; DATA-NEXT:    [[REVERSE3:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD2]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; DATA-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP2]]
-; DATA-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
-; DATA-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
-; DATA-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 -3
-; DATA-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 -4
-; DATA-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP14]], i32 -3
-; DATA-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP13]], align 8
-; DATA-NEXT:    [[REVERSE5:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD4]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; DATA-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i64>, ptr [[TMP15]], align 8
-; DATA-NEXT:    [[REVERSE7:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD6]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; DATA-NEXT:    [[TMP16:%.*]] = icmp sgt <4 x i64> [[REVERSE]], [[REVERSE5]]
-; DATA-NEXT:    [[TMP17:%.*]] = icmp sgt <4 x i64> [[REVERSE3]], [[REVERSE7]]
-; DATA-NEXT:    [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP16]])
-; DATA-NEXT:    [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP17]])
-; DATA-NEXT:    [[CSA_MASK_SEL:%.*]] = select i1 [[TMP18]], <4 x i1> [[TMP16]], <4 x i1> [[CSA_MASK_PHI]]
-; DATA-NEXT:    [[CSA_MASK_SEL8]] = select i1 [[TMP19]], <4 x i1> [[TMP17]], <4 x i1> [[CSA_MASK_SEL]]
-; DATA-NEXT:    [[CSA_DATA_SEL:%.*]] = select i1 [[TMP18]], <4 x i64> [[VEC_IND]], <4 x i64> [[CSA_DATA_PHI]]
-; DATA-NEXT:    [[CSA_DATA_SEL9]] = select i1 [[TMP19]], <4 x i64> [[STEP_ADD]], <4 x i64> [[CSA_DATA_SEL]]
+; DATA-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <8 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[VEC_IND:%.*]] = phi <8 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <8 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[TMP0:%.*]] = add <8 x i64> [[VEC_IND]], <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
+; DATA-NEXT:    [[TMP1:%.*]] = extractelement <8 x i64> [[TMP0]], i32 0
+; DATA-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP1]]
+; DATA-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
+; DATA-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 -7
+; DATA-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i64>, ptr [[TMP4]], align 8
+; DATA-NEXT:    [[REVERSE:%.*]] = shufflevector <8 x i64> [[WIDE_LOAD]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; DATA-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP1]]
+; DATA-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
+; DATA-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -7
+; DATA-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i64>, ptr [[TMP7]], align 8
+; DATA-NEXT:    [[REVERSE2:%.*]] = shufflevector <8 x i64> [[WIDE_LOAD1]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; DATA-NEXT:    [[TMP8:%.*]] = icmp sgt <8 x i64> [[REVERSE]], [[REVERSE2]]
+; DATA-NEXT:    [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP8]])
+; DATA-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP9]], <8 x i1> [[TMP8]], <8 x i1> [[CSA_MASK_PHI]]
+; DATA-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP9]], <8 x i64> [[VEC_IND]], <8 x i64> [[CSA_DATA_PHI]]
 ; DATA-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; DATA-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], <i64 -4, i64 -4, i64 -4, i64 -4>
-; DATA-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DATA-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; DATA-NEXT:    [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], <i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8>
+; DATA-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DATA-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; DATA:       middle.block:
-; DATA-NEXT:    [[TMP21:%.*]] = select <4 x i1> [[CSA_MASK_SEL8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> zeroinitializer
-; DATA-NEXT:    [[TMP22:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP21]])
-; DATA-NEXT:    [[TMP23:%.*]] = extractelement <4 x i1> [[CSA_MASK_SEL8]], i64 0
-; DATA-NEXT:    [[TMP24:%.*]] = icmp eq i32 [[TMP22]], 0
-; DATA-NEXT:    [[TMP25:%.*]] = and i1 [[TMP23]], [[TMP24]]
-; DATA-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i32 0, i32 -1
-; DATA-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <4 x i64> [[CSA_DATA_SEL9]], i32 [[TMP26]]
-; DATA-NEXT:    [[TMP27:%.*]] = icmp sge i32 [[TMP26]], 0
-; DATA-NEXT:    [[TMP28:%.*]] = select i1 [[TMP27]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
+; DATA-NEXT:    [[TMP11:%.*]] = select <8 x i1> [[CSA_MASK_SEL]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, <8 x i32> zeroinitializer
+; DATA-NEXT:    [[TMP12:%.*]] = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> [[TMP11]])
+; DATA-NEXT:    [[TMP13:%.*]] = extractelement <8 x i1> [[CSA_MASK_SEL]], i64 0
+; DATA-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], 0
+; DATA-NEXT:    [[TMP15:%.*]] = and i1 [[TMP13]], [[TMP14]]
+; DATA-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 0, i32 -1
+; DATA-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <8 x i64> [[CSA_DATA_SEL]], i32 [[TMP16]]
+; DATA-NEXT:    [[TMP17:%.*]] = icmp sge i32 [[TMP16]], 0
+; DATA-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
 ; DATA-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; DATA-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; DATA:       scalar.ph:
 ; DATA-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ]
 ; DATA-NEXT:    br label [[FOR_BODY:%.*]]
 ; DATA:       for.cond.cleanup.loopexit:
-; DATA-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ]
+; DATA-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
 ; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; DATA:       for.cond.cleanup:
 ; DATA-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
@@ -4637,10 +4586,10 @@ define dso_local i64 @idx_scalar_dec(ptr %a, ptr %b, i64 %ii, i64 %n) {
 ; DATA-NEXT:    [[IDX_010:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
 ; DATA-NEXT:    [[SUB]] = add i64 [[I_011]], -1
 ; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[SUB]]
-; DATA-NEXT:    [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; DATA-NEXT:    [[TMP19:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
 ; DATA-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[SUB]]
-; DATA-NEXT:    [[TMP30:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
-; DATA-NEXT:    [[CMP3:%.*]] = icmp sgt i64 [[TMP29]], [[TMP30]]
+; DATA-NEXT:    [[TMP20:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
+; DATA-NEXT:    [[CMP3:%.*]] = icmp sgt i64 [[TMP19]], [[TMP20]]
 ; DATA-NEXT:    [[COND]] = select i1 [[CMP3]], i64 [[I_011]], i64 [[IDX_010]]
 ; DATA-NEXT:    [[CMP_NOT:%.*]] = icmp eq i64 [[SUB]], 0
 ; DATA-NEXT:    br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]

>From bfedfe3f75dfd1859b3e8f10f46c59ff2ea0ee80 Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Fri, 30 Aug 2024 08:48:08 -0700
Subject: [PATCH 06/23] fixup! repond to @artagnon initial set of review

---
 llvm/lib/Transforms/Vectorize/VPlan.h         | 13 ---------
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 26 ------------------
 llvm/lib/Transforms/Vectorize/VPlanUtils.cpp  | 27 +++++++++++++++++++
 llvm/lib/Transforms/Vectorize/VPlanUtils.h    | 10 +++++++
 .../Transforms/Vectorize/VPlanVerifier.cpp    |  1 +
 .../Transforms/LoopVectorize/RISCV/csa.ll     |  8 +++---
 6 files changed, 42 insertions(+), 43 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 163f14de8bd3f2..20b815a813bcaf 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -4185,19 +4185,6 @@ class VPlanSlp {
   bool isCompletelySLP() const { return CompletelySLP; }
 };
 
-namespace vputils {
-
-/// Returns true for PHI-like recipes.
-bool isPhi(const VPRecipeBase &R);
-
-/// Returns true for PHI-like recipes that generate their own backedge
-bool isPhiThatGeneratesBackedge(const VPRecipeBase &R);
-
-/// Returns true for PHI-like recipes that exists in vector loop header basic
-/// block
-bool isHeaderPhi(const VPRecipeBase &R);
-} // end namespace vputils
-
 } // end namespace llvm
 
 #endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index b85145d0073a31..7fc258e15579df 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3576,29 +3576,3 @@ void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
 }
 #endif
 
-bool vputils::isPhi(const VPRecipeBase &R) {
-  if (R.isPhi())
-    return true;
-  if (auto *VPInst = dyn_cast<VPInstruction>(&R))
-    return VPInst->getOpcode() == VPInstruction::CSAMaskPhi ||
-           VPInst->getOpcode() == VPInstruction::CSAVLPhi;
-  return false;
-}
-
-bool vputils::isPhiThatGeneratesBackedge(const VPRecipeBase &R) {
-  if (isa<VPWidenPHIRecipe, VPCSAHeaderPHIRecipe>(&R))
-    return true;
-  if (auto *VPInst = dyn_cast<VPInstruction>(&R))
-    return VPInst->getOpcode() == VPInstruction::CSAMaskPhi ||
-           VPInst->getOpcode() == VPInstruction::CSAVLPhi;
-  return false;
-}
-
-bool vputils::isHeaderPhi(const VPRecipeBase &R) {
-  if (isa<VPHeaderPHIRecipe, VPWidenPHIRecipe>(&R))
-    return true;
-  if (auto *VPInst = dyn_cast<VPInstruction>(&R))
-    return VPInst->getOpcode() == VPInstruction::CSAMaskPhi ||
-           VPInst->getOpcode() == VPInstruction::CSAVLPhi;
-  return false;
-}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 4621c28b051298..0a6e58f87f7ad4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -115,3 +115,30 @@ bool vputils::isUniformAcrossVFsAndUFs(VPValue *V) {
         return false;
       });
 }
+
+bool vputils::isPhi(const VPRecipeBase &R) {
+  if (R.isPhi())
+    return true;
+  if (auto *VPInst = dyn_cast<VPInstruction>(&R))
+    return VPInst->getOpcode() == VPInstruction::CSAMaskPhi ||
+           VPInst->getOpcode() == VPInstruction::CSAVLPhi;
+  return false;
+}
+
+bool vputils::isPhiThatGeneratesBackedge(const VPRecipeBase &R) {
+  if (isa<VPWidenPHIRecipe, VPCSAHeaderPHIRecipe>(&R))
+    return true;
+  if (auto *VPInst = dyn_cast<VPInstruction>(&R))
+    return VPInst->getOpcode() == VPInstruction::CSAMaskPhi ||
+           VPInst->getOpcode() == VPInstruction::CSAVLPhi;
+  return false;
+}
+
+bool vputils::isHeaderPhi(const VPRecipeBase &R) {
+  if (isa<VPHeaderPHIRecipe, VPWidenPHIRecipe>(&R))
+    return true;
+  if (auto *VPInst = dyn_cast<VPInstruction>(&R))
+    return VPInst->getOpcode() == VPInstruction::CSAMaskPhi ||
+           VPInst->getOpcode() == VPInstruction::CSAVLPhi;
+  return false;
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index 96577700205213..40cf8b3e87f122 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -61,6 +61,16 @@ bool isHeaderMask(const VPValue *V, VPlan &Plan);
 /// VPDerivedIV or VPCanonicalIVPHI).
 bool isUniformAcrossVFsAndUFs(VPValue *V);
 
+/// Returns true for PHI-like recipes.
+bool isPhi(const VPRecipeBase &R);
+
+/// Returns true for PHI-like recipes that generate their own backedge
+bool isPhiThatGeneratesBackedge(const VPRecipeBase &R);
+
+/// Returns true for PHI-like recipes that exists in vector loop header basic
+/// block
+bool isHeaderPhi(const VPRecipeBase &R);
+
 } // end namespace llvm::vputils
 
 #endif
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index b4bc3de463de11..baafa70b9effbf 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -15,6 +15,7 @@
 #include "VPlanVerifier.h"
 #include "VPlan.h"
 #include "VPlanCFG.h"
+#include "VPlanUtils.h"
 #include "VPlanDominatorTree.h"
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/SmallPtrSet.h"
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/csa.ll b/llvm/test/Transforms/LoopVectorize/RISCV/csa.ll
index c90d88d912edaa..e67002e2b2d905 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/csa.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/csa.ll
@@ -259,7 +259,7 @@ for.body:                                         ; preds = %for.body.preheader,
 }
 
 ; This function is generated from the following C/C++ program:
-; int simple_csa_int_select(int N, int *data) {
+; int simple_csa_int_select_induction_cmp(int N, int *data) {
 ;   int t = -1;
 ;   for (int i = 0; i < N; i++) {
 ;     if (i < data[i])
@@ -1373,13 +1373,13 @@ for.body:                                         ; preds = %for.body.preheader,
 }
 
 ; This function is generated from the following C/C++ program:
-; int csa_in_series_int_select(int N, int *data0, int *data1) {
+; int csa_in_series_int_select_induction_cmp(int N, int *data0, int *data1) {
 ;   int t = -1;
 ;   int s = -1;
 ;   for (int i = 0; i < N; i++) {
-;     if (a < data0[i])
+;     if (i < data0[i])
 ;       t = data0[i];
-;     if (a < data1[i])
+;     if (i < data1[i])
 ;       s = data1[i];
 ;   }
 ;   return t | s; // use t and s

>From 354cabbd2d30d69e68d75218a3d270cd85300756 Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Fri, 30 Aug 2024 10:33:17 -0700
Subject: [PATCH 07/23] fixup! reorganize tests

---
 .../RISCV/conditional-scalar-assignment.ll    |  833 +++
 .../Transforms/LoopVectorize/RISCV/csa.ll     | 5188 -----------------
 .../conditional-scalar-assignment.ll          | 3091 ++++++++++
 3 files changed, 3924 insertions(+), 5188 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/conditional-scalar-assignment.ll
 delete mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/csa.ll
 create mode 100644 llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment.ll

diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/conditional-scalar-assignment.ll b/llvm/test/Transforms/LoopVectorize/RISCV/conditional-scalar-assignment.ll
new file mode 100644
index 00000000000000..6d7816800603e2
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/conditional-scalar-assignment.ll
@@ -0,0 +1,833 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -passes=loop-vectorize -force-tail-folding-style=data-with-evl \
+; RUN:   -enable-csa-vectorization -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-min=256 \
+; RUN:   | FileCheck %s -check-prefix=EVL
+; RUN: opt < %s -S -passes=loop-vectorize -force-tail-folding-style=none \
+; RUN:   -enable-csa-vectorization -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-min=256 \
+; RUN:   | FileCheck %s -check-prefix=NO-EVL
+; RUN: opt < %s -S -passes=loop-vectorize -force-tail-folding-style=data \
+; RUN:   -enable-csa-vectorization -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-min=256 \
+; RUN:   | FileCheck %s -check-prefix=DATA
+
+; This function is generated from the following C/C++ program:
+; uint64_t idx_scalar(int64_t *a, int64_t *b, uint64_t ii, uint64_t n) {
+;   uint64_t idx = ii;
+;   for (uint64_t i = 0; i < n; ++i)
+;     idx = (a[i] > b[i]) ? i : idx;
+;   return idx;
+; }
+define i64 @idx_scalar(ptr %a, ptr %b, i64 %ii, i64 %n) {
+; EVL-LABEL: @idx_scalar(
+; EVL-NEXT:  entry:
+; EVL-NEXT:    [[CMP8_NOT:%.*]] = icmp eq i64 [[N:%.*]], 0
+; EVL-NEXT:    br i1 [[CMP8_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; EVL:       for.body.preheader:
+; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; EVL:       vector.ph:
+; EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; EVL-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; EVL-NEXT:    [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
+; EVL-NEXT:    [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
+; EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; EVL-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP10]]
+; EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
+; EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; EVL:       vector.body:
+; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP12]]
+; EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i32 0
+; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP14]], align 8
+; EVL-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP12]]
+; EVL-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[TMP15]], i32 0
+; EVL-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i64>, ptr [[TMP16]], align 8
+; EVL-NEXT:    [[TMP17:%.*]] = icmp sgt <vscale x 2 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; EVL-NEXT:    [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP17]])
+; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP18]], <vscale x 2 x i1> [[TMP17]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP18]], <vscale x 2 x i64> [[VEC_IND]], <vscale x 2 x i64> [[CSA_DATA_PHI]]
+; EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; EVL-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; EVL:       middle.block:
+; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
+; EVL-NEXT:    [[TMP20:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
+; EVL-NEXT:    [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP20]])
+; EVL-NEXT:    [[TMP22:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT:    [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
+; EVL-NEXT:    [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
+; EVL-NEXT:    [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
+; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x i64> [[CSA_DATA_SEL]], i32 [[TMP25]]
+; EVL-NEXT:    [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
+; EVL-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
+; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL:       scalar.ph:
+; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; EVL:       for.cond.cleanup.loopexit:
+; EVL-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; EVL:       for.cond.cleanup:
+; EVL-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; EVL-NEXT:    ret i64 [[IDX_0_LCSSA]]
+; EVL:       for.body:
+; EVL-NEXT:    [[I_010:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; EVL-NEXT:    [[IDX_09:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[I_010]]
+; EVL-NEXT:    [[TMP28:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; EVL-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[I_010]]
+; EVL-NEXT:    [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; EVL-NEXT:    [[CMP2:%.*]] = icmp sgt i64 [[TMP28]], [[TMP29]]
+; EVL-NEXT:    [[COND]] = select i1 [[CMP2]], i64 [[I_010]], i64 [[IDX_09]]
+; EVL-NEXT:    [[INC]] = add nuw i64 [[I_010]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+;
+; NO-EVL-LABEL: @idx_scalar(
+; NO-EVL-NEXT:  entry:
+; NO-EVL-NEXT:    [[CMP8_NOT:%.*]] = icmp eq i64 [[N:%.*]], 0
+; NO-EVL-NEXT:    br i1 [[CMP8_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; NO-EVL:       for.body.preheader:
+; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-EVL:       vector.ph:
+; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; NO-EVL-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; NO-EVL-NEXT:    [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
+; NO-EVL-NEXT:    [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; NO-EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
+; NO-EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; NO-EVL-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP10]]
+; NO-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
+; NO-EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NO-EVL:       vector.body:
+; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP12]]
+; NO-EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i32 0
+; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP14]], align 8
+; NO-EVL-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP12]]
+; NO-EVL-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[TMP15]], i32 0
+; NO-EVL-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i64>, ptr [[TMP16]], align 8
+; NO-EVL-NEXT:    [[TMP17:%.*]] = icmp sgt <vscale x 2 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; NO-EVL-NEXT:    [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP17]])
+; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP18]], <vscale x 2 x i1> [[TMP17]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP18]], <vscale x 2 x i64> [[VEC_IND]], <vscale x 2 x i64> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; NO-EVL-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-EVL:       middle.block:
+; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
+; NO-EVL-NEXT:    [[TMP20:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
+; NO-EVL-NEXT:    [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP20]])
+; NO-EVL-NEXT:    [[TMP22:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT:    [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
+; NO-EVL-NEXT:    [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
+; NO-EVL-NEXT:    [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
+; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x i64> [[CSA_DATA_SEL]], i32 [[TMP25]]
+; NO-EVL-NEXT:    [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
+; NO-EVL-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
+; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL:       scalar.ph:
+; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-EVL:       for.cond.cleanup.loopexit:
+; NO-EVL-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; NO-EVL:       for.cond.cleanup:
+; NO-EVL-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; NO-EVL-NEXT:    ret i64 [[IDX_0_LCSSA]]
+; NO-EVL:       for.body:
+; NO-EVL-NEXT:    [[I_010:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; NO-EVL-NEXT:    [[IDX_09:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[I_010]]
+; NO-EVL-NEXT:    [[TMP28:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; NO-EVL-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[I_010]]
+; NO-EVL-NEXT:    [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; NO-EVL-NEXT:    [[CMP2:%.*]] = icmp sgt i64 [[TMP28]], [[TMP29]]
+; NO-EVL-NEXT:    [[COND]] = select i1 [[CMP2]], i64 [[I_010]], i64 [[IDX_09]]
+; NO-EVL-NEXT:    [[INC]] = add nuw i64 [[I_010]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+;
+; DATA-LABEL: @idx_scalar(
+; DATA-NEXT:  entry:
+; DATA-NEXT:    [[CMP8_NOT:%.*]] = icmp eq i64 [[N:%.*]], 0
+; DATA-NEXT:    br i1 [[CMP8_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; DATA:       for.body.preheader:
+; DATA-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; DATA-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; DATA-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DATA:       vector.ph:
+; DATA-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; DATA-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; DATA-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; DATA-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; DATA-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; DATA-NEXT:    [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
+; DATA-NEXT:    [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; DATA-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
+; DATA-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; DATA-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP10]]
+; DATA-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
+; DATA-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; DATA-NEXT:    br label [[VECTOR_BODY:%.*]]
+; DATA:       vector.body:
+; DATA-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; DATA-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP12]]
+; DATA-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i32 0
+; DATA-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP14]], align 8
+; DATA-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP12]]
+; DATA-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[TMP15]], i32 0
+; DATA-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i64>, ptr [[TMP16]], align 8
+; DATA-NEXT:    [[TMP17:%.*]] = icmp sgt <vscale x 2 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; DATA-NEXT:    [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP17]])
+; DATA-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP18]], <vscale x 2 x i1> [[TMP17]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
+; DATA-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP18]], <vscale x 2 x i64> [[VEC_IND]], <vscale x 2 x i64> [[CSA_DATA_PHI]]
+; DATA-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; DATA-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; DATA-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DATA-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; DATA:       middle.block:
+; DATA-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
+; DATA-NEXT:    [[TMP20:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
+; DATA-NEXT:    [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP20]])
+; DATA-NEXT:    [[TMP22:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
+; DATA-NEXT:    [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
+; DATA-NEXT:    [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
+; DATA-NEXT:    [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
+; DATA-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x i64> [[CSA_DATA_SEL]], i32 [[TMP25]]
+; DATA-NEXT:    [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
+; DATA-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
+; DATA-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; DATA-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DATA:       scalar.ph:
+; DATA-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; DATA-NEXT:    br label [[FOR_BODY:%.*]]
+; DATA:       for.cond.cleanup.loopexit:
+; DATA-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
+; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
+; DATA:       for.cond.cleanup:
+; DATA-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; DATA-NEXT:    ret i64 [[IDX_0_LCSSA]]
+; DATA:       for.body:
+; DATA-NEXT:    [[I_010:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; DATA-NEXT:    [[IDX_09:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
+; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[I_010]]
+; DATA-NEXT:    [[TMP28:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; DATA-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[I_010]]
+; DATA-NEXT:    [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; DATA-NEXT:    [[CMP2:%.*]] = icmp sgt i64 [[TMP28]], [[TMP29]]
+; DATA-NEXT:    [[COND]] = select i1 [[CMP2]], i64 [[I_010]], i64 [[IDX_09]]
+; DATA-NEXT:    [[INC]] = add nuw i64 [[I_010]], 1
+; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
+; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+;
+entry:
+  %cmp8.not = icmp eq i64 %n, 0
+  br i1 %cmp8.not, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %cond.lcssa = phi i64 [ %cond, %for.body ]
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %idx.0.lcssa = phi i64 [ %ii, %entry ], [ %cond.lcssa, %for.cond.cleanup.loopexit ]
+  ret i64 %idx.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.010 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %idx.09 = phi i64 [ %cond, %for.body ], [ %ii, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i64, ptr %a, i64 %i.010
+  %0 = load i64, ptr %arrayidx, align 8
+  %arrayidx1 = getelementptr inbounds i64, ptr %b, i64 %i.010
+  %1 = load i64, ptr %arrayidx1, align 8
+  %cmp2 = icmp sgt i64 %0, %1
+  %cond = select i1 %cmp2, i64 %i.010, i64 %idx.09
+  %inc = add nuw i64 %i.010, 1
+  %exitcond.not = icmp eq i64 %inc, %n
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; uint64_t idx_scalar_dec(int64_t *a, int64_t *b, uint64_t ii, uint64_t n) {
+;   uint64_t idx = ii;
+;   for (uint64_t i = n; i > 0; --i) // decreasing
+;      idx = (a[i - 1] > b[i - 1]) ? i : idx;
+;   return idx;
+; }
+define dso_local i64 @idx_scalar_dec(ptr %a, ptr %b, i64 %ii, i64 %n) {
+; EVL-LABEL: @idx_scalar_dec(
+; EVL-NEXT:  entry:
+; EVL-NEXT:    [[CMP_NOT9:%.*]] = icmp eq i64 [[N:%.*]], 0
+; EVL-NEXT:    br i1 [[CMP_NOT9]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; EVL:       for.body.preheader:
+; EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; EVL:       vector.ph:
+; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; EVL-NEXT:    [[IND_END:%.*]] = sub i64 [[N]], [[N_VEC]]
+; EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[N]], i64 0
+; EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
+; EVL-NEXT:    [[INDUCTION:%.*]] = add <8 x i64> [[DOTSPLAT]], <i64 0, i64 -1, i64 -2, i64 -3, i64 -4, i64 -5, i64 -6, i64 -7>
+; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; EVL:       vector.body:
+; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <8 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[VEC_IND:%.*]] = phi <8 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <8 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[TMP0:%.*]] = add <8 x i64> [[VEC_IND]], <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
+; EVL-NEXT:    [[TMP1:%.*]] = extractelement <8 x i64> [[TMP0]], i32 0
+; EVL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP1]]
+; EVL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
+; EVL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 -7
+; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i64>, ptr [[TMP4]], align 8
+; EVL-NEXT:    [[REVERSE:%.*]] = shufflevector <8 x i64> [[WIDE_LOAD]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; EVL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP1]]
+; EVL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
+; EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -7
+; EVL-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i64>, ptr [[TMP7]], align 8
+; EVL-NEXT:    [[REVERSE2:%.*]] = shufflevector <8 x i64> [[WIDE_LOAD1]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; EVL-NEXT:    [[TMP8:%.*]] = icmp sgt <8 x i64> [[REVERSE]], [[REVERSE2]]
+; EVL-NEXT:    [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP8]])
+; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP9]], <8 x i1> [[TMP8]], <8 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP9]], <8 x i64> [[VEC_IND]], <8 x i64> [[CSA_DATA_PHI]]
+; EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; EVL-NEXT:    [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], <i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8>
+; EVL-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; EVL:       middle.block:
+; EVL-NEXT:    [[TMP11:%.*]] = select <8 x i1> [[CSA_MASK_SEL]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, <8 x i32> zeroinitializer
+; EVL-NEXT:    [[TMP12:%.*]] = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> [[TMP11]])
+; EVL-NEXT:    [[TMP13:%.*]] = extractelement <8 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], 0
+; EVL-NEXT:    [[TMP15:%.*]] = and i1 [[TMP13]], [[TMP14]]
+; EVL-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 0, i32 -1
+; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <8 x i64> [[CSA_DATA_SEL]], i32 [[TMP16]]
+; EVL-NEXT:    [[TMP17:%.*]] = icmp sge i32 [[TMP16]], 0
+; EVL-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
+; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL:       scalar.ph:
+; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ]
+; EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; EVL:       for.cond.cleanup.loopexit:
+; EVL-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; EVL:       for.cond.cleanup:
+; EVL-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; EVL-NEXT:    ret i64 [[IDX_0_LCSSA]]
+; EVL:       for.body:
+; EVL-NEXT:    [[I_011:%.*]] = phi i64 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; EVL-NEXT:    [[IDX_010:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
+; EVL-NEXT:    [[SUB]] = add i64 [[I_011]], -1
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[SUB]]
+; EVL-NEXT:    [[TMP19:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[SUB]]
+; EVL-NEXT:    [[TMP20:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
+; EVL-NEXT:    [[CMP3:%.*]] = icmp sgt i64 [[TMP19]], [[TMP20]]
+; EVL-NEXT:    [[COND]] = select i1 [[CMP3]], i64 [[I_011]], i64 [[IDX_010]]
+; EVL-NEXT:    [[CMP_NOT:%.*]] = icmp eq i64 [[SUB]], 0
+; EVL-NEXT:    br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+;
+; NO-EVL-LABEL: @idx_scalar_dec(
+; NO-EVL-NEXT:  entry:
+; NO-EVL-NEXT:    [[CMP_NOT9:%.*]] = icmp eq i64 [[N:%.*]], 0
+; NO-EVL-NEXT:    br i1 [[CMP_NOT9]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; NO-EVL:       for.body.preheader:
+; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-EVL:       vector.ph:
+; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-EVL-NEXT:    [[IND_END:%.*]] = sub i64 [[N]], [[N_VEC]]
+; NO-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[N]], i64 0
+; NO-EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
+; NO-EVL-NEXT:    [[INDUCTION:%.*]] = add <8 x i64> [[DOTSPLAT]], <i64 0, i64 -1, i64 -2, i64 -3, i64 -4, i64 -5, i64 -6, i64 -7>
+; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NO-EVL:       vector.body:
+; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <8 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[VEC_IND:%.*]] = phi <8 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <8 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[TMP0:%.*]] = add <8 x i64> [[VEC_IND]], <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
+; NO-EVL-NEXT:    [[TMP1:%.*]] = extractelement <8 x i64> [[TMP0]], i32 0
+; NO-EVL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP1]]
+; NO-EVL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
+; NO-EVL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 -7
+; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i64>, ptr [[TMP4]], align 8
+; NO-EVL-NEXT:    [[REVERSE:%.*]] = shufflevector <8 x i64> [[WIDE_LOAD]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; NO-EVL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP1]]
+; NO-EVL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
+; NO-EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -7
+; NO-EVL-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i64>, ptr [[TMP7]], align 8
+; NO-EVL-NEXT:    [[REVERSE2:%.*]] = shufflevector <8 x i64> [[WIDE_LOAD1]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; NO-EVL-NEXT:    [[TMP8:%.*]] = icmp sgt <8 x i64> [[REVERSE]], [[REVERSE2]]
+; NO-EVL-NEXT:    [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP8]])
+; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP9]], <8 x i1> [[TMP8]], <8 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP9]], <8 x i64> [[VEC_IND]], <8 x i64> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; NO-EVL-NEXT:    [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], <i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8>
+; NO-EVL-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; NO-EVL:       middle.block:
+; NO-EVL-NEXT:    [[TMP11:%.*]] = select <8 x i1> [[CSA_MASK_SEL]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, <8 x i32> zeroinitializer
+; NO-EVL-NEXT:    [[TMP12:%.*]] = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> [[TMP11]])
+; NO-EVL-NEXT:    [[TMP13:%.*]] = extractelement <8 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], 0
+; NO-EVL-NEXT:    [[TMP15:%.*]] = and i1 [[TMP13]], [[TMP14]]
+; NO-EVL-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 0, i32 -1
+; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <8 x i64> [[CSA_DATA_SEL]], i32 [[TMP16]]
+; NO-EVL-NEXT:    [[TMP17:%.*]] = icmp sge i32 [[TMP16]], 0
+; NO-EVL-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
+; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL:       scalar.ph:
+; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ]
+; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-EVL:       for.cond.cleanup.loopexit:
+; NO-EVL-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; NO-EVL:       for.cond.cleanup:
+; NO-EVL-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; NO-EVL-NEXT:    ret i64 [[IDX_0_LCSSA]]
+; NO-EVL:       for.body:
+; NO-EVL-NEXT:    [[I_011:%.*]] = phi i64 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; NO-EVL-NEXT:    [[IDX_010:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
+; NO-EVL-NEXT:    [[SUB]] = add i64 [[I_011]], -1
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[SUB]]
+; NO-EVL-NEXT:    [[TMP19:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; NO-EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[SUB]]
+; NO-EVL-NEXT:    [[TMP20:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
+; NO-EVL-NEXT:    [[CMP3:%.*]] = icmp sgt i64 [[TMP19]], [[TMP20]]
+; NO-EVL-NEXT:    [[COND]] = select i1 [[CMP3]], i64 [[I_011]], i64 [[IDX_010]]
+; NO-EVL-NEXT:    [[CMP_NOT:%.*]] = icmp eq i64 [[SUB]], 0
+; NO-EVL-NEXT:    br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+;
+; DATA-LABEL: @idx_scalar_dec(
+; DATA-NEXT:  entry:
+; DATA-NEXT:    [[CMP_NOT9:%.*]] = icmp eq i64 [[N:%.*]], 0
+; DATA-NEXT:    br i1 [[CMP_NOT9]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; DATA:       for.body.preheader:
+; DATA-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; DATA-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DATA:       vector.ph:
+; DATA-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; DATA-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; DATA-NEXT:    [[IND_END:%.*]] = sub i64 [[N]], [[N_VEC]]
+; DATA-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[N]], i64 0
+; DATA-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
+; DATA-NEXT:    [[INDUCTION:%.*]] = add <8 x i64> [[DOTSPLAT]], <i64 0, i64 -1, i64 -2, i64 -3, i64 -4, i64 -5, i64 -6, i64 -7>
+; DATA-NEXT:    br label [[VECTOR_BODY:%.*]]
+; DATA:       vector.body:
+; DATA-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <8 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[VEC_IND:%.*]] = phi <8 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <8 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[TMP0:%.*]] = add <8 x i64> [[VEC_IND]], <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
+; DATA-NEXT:    [[TMP1:%.*]] = extractelement <8 x i64> [[TMP0]], i32 0
+; DATA-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP1]]
+; DATA-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
+; DATA-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 -7
+; DATA-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i64>, ptr [[TMP4]], align 8
+; DATA-NEXT:    [[REVERSE:%.*]] = shufflevector <8 x i64> [[WIDE_LOAD]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; DATA-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP1]]
+; DATA-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
+; DATA-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -7
+; DATA-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i64>, ptr [[TMP7]], align 8
+; DATA-NEXT:    [[REVERSE2:%.*]] = shufflevector <8 x i64> [[WIDE_LOAD1]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; DATA-NEXT:    [[TMP8:%.*]] = icmp sgt <8 x i64> [[REVERSE]], [[REVERSE2]]
+; DATA-NEXT:    [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP8]])
+; DATA-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP9]], <8 x i1> [[TMP8]], <8 x i1> [[CSA_MASK_PHI]]
+; DATA-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP9]], <8 x i64> [[VEC_IND]], <8 x i64> [[CSA_DATA_PHI]]
+; DATA-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; DATA-NEXT:    [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], <i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8>
+; DATA-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DATA-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; DATA:       middle.block:
+; DATA-NEXT:    [[TMP11:%.*]] = select <8 x i1> [[CSA_MASK_SEL]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, <8 x i32> zeroinitializer
+; DATA-NEXT:    [[TMP12:%.*]] = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> [[TMP11]])
+; DATA-NEXT:    [[TMP13:%.*]] = extractelement <8 x i1> [[CSA_MASK_SEL]], i64 0
+; DATA-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], 0
+; DATA-NEXT:    [[TMP15:%.*]] = and i1 [[TMP13]], [[TMP14]]
+; DATA-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 0, i32 -1
+; DATA-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <8 x i64> [[CSA_DATA_SEL]], i32 [[TMP16]]
+; DATA-NEXT:    [[TMP17:%.*]] = icmp sge i32 [[TMP16]], 0
+; DATA-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
+; DATA-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; DATA-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DATA:       scalar.ph:
+; DATA-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ]
+; DATA-NEXT:    br label [[FOR_BODY:%.*]]
+; DATA:       for.cond.cleanup.loopexit:
+; DATA-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
+; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
+; DATA:       for.cond.cleanup:
+; DATA-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; DATA-NEXT:    ret i64 [[IDX_0_LCSSA]]
+; DATA:       for.body:
+; DATA-NEXT:    [[I_011:%.*]] = phi i64 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; DATA-NEXT:    [[IDX_010:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
+; DATA-NEXT:    [[SUB]] = add i64 [[I_011]], -1
+; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[SUB]]
+; DATA-NEXT:    [[TMP19:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; DATA-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[SUB]]
+; DATA-NEXT:    [[TMP20:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
+; DATA-NEXT:    [[CMP3:%.*]] = icmp sgt i64 [[TMP19]], [[TMP20]]
+; DATA-NEXT:    [[COND]] = select i1 [[CMP3]], i64 [[I_011]], i64 [[IDX_010]]
+; DATA-NEXT:    [[CMP_NOT:%.*]] = icmp eq i64 [[SUB]], 0
+; DATA-NEXT:    br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+;
+entry:
+  %cmp.not9 = icmp eq i64 %n, 0
+  br i1 %cmp.not9, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %cond.lcssa = phi i64 [ %cond, %for.body ]
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %idx.0.lcssa = phi i64 [ %ii, %entry ], [ %cond.lcssa, %for.cond.cleanup.loopexit ]
+  ret i64 %idx.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.011 = phi i64 [ %sub, %for.body ], [ %n, %for.body.preheader ]
+  %idx.010 = phi i64 [ %cond, %for.body ], [ %ii, %for.body.preheader ]
+  %sub = add i64 %i.011, -1
+  %arrayidx = getelementptr inbounds i64, ptr %a, i64 %sub
+  %0 = load i64, ptr %arrayidx, align 8
+  %arrayidx2 = getelementptr inbounds i64, ptr %b, i64 %sub
+  %1 = load i64, ptr %arrayidx2, align 8
+  %cmp3 = icmp sgt i64 %0, %1
+  %cond = select i1 %cmp3, i64 %i.011, i64 %idx.010
+  %cmp.not = icmp eq i64 %sub, 0
+  br i1 %cmp.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+
+; This function is generated from the following C/C++ program:
+; int *simple_csa_ptr_select(int N, int **data) {
+;   int *t = nullptr;
+;   for (int i = 0; i < N; i++) {
+;     if (a < *data[i])
+;       t = data[i];
+;   }
+;   return t; // use t
+; }
+define ptr @simple_csa_ptr_select(i32 %N, ptr %data) {
+; EVL-LABEL: @simple_csa_ptr_select(
+; EVL-NEXT:  entry:
+; EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL:       for.body.preheader:
+; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; EVL:       vector.ph:
+; EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; EVL-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; EVL-NEXT:    [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
+; EVL-NEXT:    [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
+; EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; EVL-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP10]]
+; EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
+; EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; EVL:       vector.body:
+; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x ptr> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[TMP12]]
+; EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds ptr, ptr [[TMP13]], i32 0
+; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x ptr>, ptr [[TMP14]], align 8
+; EVL-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> [[WIDE_LOAD]], i32 4, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i32> poison)
+; EVL-NEXT:    [[TMP15:%.*]] = sext <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i64>
+; EVL-NEXT:    [[TMP16:%.*]] = icmp slt <vscale x 2 x i64> [[VEC_IND]], [[TMP15]]
+; EVL-NEXT:    [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP16]])
+; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 2 x i1> [[TMP16]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 2 x ptr> [[WIDE_LOAD]], <vscale x 2 x ptr> [[CSA_DATA_PHI]]
+; EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; EVL-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; EVL:       middle.block:
+; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
+; EVL-NEXT:    [[TMP19:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
+; EVL-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP19]])
+; EVL-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT:    [[TMP22:%.*]] = icmp eq i32 [[TMP20]], 0
+; EVL-NEXT:    [[TMP23:%.*]] = and i1 [[TMP21]], [[TMP22]]
+; EVL-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i32 0, i32 -1
+; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x ptr> [[CSA_DATA_SEL]], i32 [[TMP24]]
+; EVL-NEXT:    [[TMP25:%.*]] = icmp sge i32 [[TMP24]], 0
+; EVL-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], ptr [[CSA_EXTRACT]], ptr null
+; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL:       scalar.ph:
+; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; EVL:       for.cond.cleanup.loopexit:
+; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; EVL:       for.cond.cleanup:
+; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; EVL-NEXT:    ret ptr [[T_0_LCSSA]]
+; EVL:       for.body:
+; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[T_010:%.*]] = phi ptr [ null, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
+; EVL-NEXT:    [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4
+; EVL-NEXT:    [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
+; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP29]]
+; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP27]], ptr [[T_010]]
+; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+;
+; NO-EVL-LABEL: @simple_csa_ptr_select(
+; NO-EVL-NEXT:  entry:
+; NO-EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL:       for.body.preheader:
+; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-EVL:       vector.ph:
+; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; NO-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; NO-EVL-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; NO-EVL-NEXT:    [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
+; NO-EVL-NEXT:    [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; NO-EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
+; NO-EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; NO-EVL-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP10]]
+; NO-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
+; NO-EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NO-EVL:       vector.body:
+; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x ptr> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[TMP12]]
+; NO-EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds ptr, ptr [[TMP13]], i32 0
+; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x ptr>, ptr [[TMP14]], align 8
+; NO-EVL-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> [[WIDE_LOAD]], i32 4, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i32> poison)
+; NO-EVL-NEXT:    [[TMP15:%.*]] = sext <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i64>
+; NO-EVL-NEXT:    [[TMP16:%.*]] = icmp slt <vscale x 2 x i64> [[VEC_IND]], [[TMP15]]
+; NO-EVL-NEXT:    [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP16]])
+; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 2 x i1> [[TMP16]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 2 x ptr> [[WIDE_LOAD]], <vscale x 2 x ptr> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; NO-EVL-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; NO-EVL:       middle.block:
+; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
+; NO-EVL-NEXT:    [[TMP19:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
+; NO-EVL-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP19]])
+; NO-EVL-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT:    [[TMP22:%.*]] = icmp eq i32 [[TMP20]], 0
+; NO-EVL-NEXT:    [[TMP23:%.*]] = and i1 [[TMP21]], [[TMP22]]
+; NO-EVL-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i32 0, i32 -1
+; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x ptr> [[CSA_DATA_SEL]], i32 [[TMP24]]
+; NO-EVL-NEXT:    [[TMP25:%.*]] = icmp sge i32 [[TMP24]], 0
+; NO-EVL-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], ptr [[CSA_EXTRACT]], ptr null
+; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL:       scalar.ph:
+; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-EVL:       for.cond.cleanup.loopexit:
+; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; NO-EVL:       for.cond.cleanup:
+; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; NO-EVL-NEXT:    ret ptr [[T_0_LCSSA]]
+; NO-EVL:       for.body:
+; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[T_010:%.*]] = phi ptr [ null, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
+; NO-EVL-NEXT:    [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4
+; NO-EVL-NEXT:    [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
+; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP29]]
+; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP27]], ptr [[T_010]]
+; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+;
+; DATA-LABEL: @simple_csa_ptr_select(
+; DATA-NEXT:  entry:
+; DATA-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA:       for.body.preheader:
+; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; DATA-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; DATA-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DATA:       vector.ph:
+; DATA-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; DATA-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; DATA-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; DATA-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; DATA-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; DATA-NEXT:    [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
+; DATA-NEXT:    [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; DATA-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
+; DATA-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; DATA-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP10]]
+; DATA-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
+; DATA-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; DATA-NEXT:    br label [[VECTOR_BODY:%.*]]
+; DATA:       vector.body:
+; DATA-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x ptr> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; DATA-NEXT:    [[TMP13:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[TMP12]]
+; DATA-NEXT:    [[TMP14:%.*]] = getelementptr inbounds ptr, ptr [[TMP13]], i32 0
+; DATA-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x ptr>, ptr [[TMP14]], align 8
+; DATA-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> [[WIDE_LOAD]], i32 4, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i32> poison)
+; DATA-NEXT:    [[TMP15:%.*]] = sext <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i64>
+; DATA-NEXT:    [[TMP16:%.*]] = icmp slt <vscale x 2 x i64> [[VEC_IND]], [[TMP15]]
+; DATA-NEXT:    [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP16]])
+; DATA-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 2 x i1> [[TMP16]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
+; DATA-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 2 x ptr> [[WIDE_LOAD]], <vscale x 2 x ptr> [[CSA_DATA_PHI]]
+; DATA-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; DATA-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; DATA-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DATA-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; DATA:       middle.block:
+; DATA-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
+; DATA-NEXT:    [[TMP19:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
+; DATA-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP19]])
+; DATA-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
+; DATA-NEXT:    [[TMP22:%.*]] = icmp eq i32 [[TMP20]], 0
+; DATA-NEXT:    [[TMP23:%.*]] = and i1 [[TMP21]], [[TMP22]]
+; DATA-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i32 0, i32 -1
+; DATA-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x ptr> [[CSA_DATA_SEL]], i32 [[TMP24]]
+; DATA-NEXT:    [[TMP25:%.*]] = icmp sge i32 [[TMP24]], 0
+; DATA-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], ptr [[CSA_EXTRACT]], ptr null
+; DATA-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; DATA-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DATA:       scalar.ph:
+; DATA-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; DATA-NEXT:    br label [[FOR_BODY:%.*]]
+; DATA:       for.cond.cleanup.loopexit:
+; DATA-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
+; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
+; DATA:       for.cond.cleanup:
+; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; DATA-NEXT:    ret ptr [[T_0_LCSSA]]
+; DATA:       for.body:
+; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[T_010:%.*]] = phi ptr [ null, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
+; DATA-NEXT:    [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4
+; DATA-NEXT:    [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
+; DATA-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP29]]
+; DATA-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP27]], ptr [[T_010]]
+; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+;
+entry:
+  %cmp9 = icmp sgt i32 %N, 0
+  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  %spec.select.lcssa = phi ptr [ %spec.select, %for.body ]
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  %t.0.lcssa = phi ptr [ null, %entry ], [ %spec.select.lcssa, %for.cond.cleanup.loopexit ]
+  ret ptr %t.0.lcssa
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %t.010 = phi ptr [ null, %for.body.preheader ], [ %spec.select, %for.body ]
+  %arrayidx = getelementptr inbounds ptr, ptr %data, i64 %indvars.iv
+  %0 = load ptr, ptr %arrayidx, align 8
+  %1 = load i32, ptr %0, align 4
+  %2 = sext i32 %1 to i64
+  %cmp1 = icmp slt i64 %indvars.iv, %2
+  %spec.select = select i1 %cmp1, ptr %0, ptr %t.010
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/csa.ll b/llvm/test/Transforms/LoopVectorize/RISCV/csa.ll
deleted file mode 100644
index e67002e2b2d905..00000000000000
--- a/llvm/test/Transforms/LoopVectorize/RISCV/csa.ll
+++ /dev/null
@@ -1,5188 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -S -mtriple riscv64 -mattr="+v" -riscv-v-vector-bits-min=256 \
-; RUN:   -passes=loop-vectorize -force-tail-folding-style=data-with-evl \
-; RUN:   -enable-csa-vectorization | FileCheck %s -check-prefix=EVL
-; RUN: opt < %s -S -mtriple riscv64 -mattr="+v" -riscv-v-vector-bits-min=256 \
-; RUN:   -passes=loop-vectorize -force-tail-folding-style=none \
-; RUN:   -enable-csa-vectorization | FileCheck %s -check-prefix=NO-EVL
-; RUN: opt < %s -S -mtriple riscv64 -mattr="+v" -riscv-v-vector-bits-min=256 \
-; RUN:   -passes=loop-vectorize -force-tail-folding-style=data \
-; RUN:   -enable-csa-vectorization | FileCheck %s -check-prefix=DATA
-
-; This function is generated from the following C/C++ program:
-; int simple_csa_int_select(int N, int *data, int a) {
-;   int t = -1;
-;   for (int i = 0; i < N; i++) {
-;     if (a < data[i])
-;       t = data[i];
-;   }
-;   return t; // use t
-; }
-define i32 @simple_csa_int_select(i32 %N, ptr %data, i64 %a) {
-; EVL-LABEL: @simple_csa_int_select(
-; EVL-NEXT:  entry:
-; EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; EVL:       for.body.preheader:
-; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; EVL:       vector.ph:
-; EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[A:%.*]], i64 0
-; EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; EVL:       vector.body:
-; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP6]]
-; EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
-; EVL-NEXT:    [[TMP9:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
-; EVL-NEXT:    [[TMP10:%.*]] = icmp slt <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP9]]
-; EVL-NEXT:    [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP10]])
-; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP11]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
-; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP11]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
-; EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; EVL-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; EVL:       middle.block:
-; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; EVL-NEXT:    [[TMP13:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
-; EVL-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP13]])
-; EVL-NEXT:    [[TMP15:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
-; EVL-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[TMP14]], 0
-; EVL-NEXT:    [[TMP17:%.*]] = and i1 [[TMP15]], [[TMP16]]
-; EVL-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i32 0, i32 -1
-; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP18]]
-; EVL-NEXT:    [[TMP19:%.*]] = icmp sge i32 [[TMP18]], 0
-; EVL-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[CSA_EXTRACT]], i32 -1
-; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; EVL:       scalar.ph:
-; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       for.cond.cleanup.loopexit:
-; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; EVL:       for.cond.cleanup:
-; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; EVL-NEXT:    ret i32 [[T_0_LCSSA]]
-; EVL:       for.body:
-; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT:    [[TMP22:%.*]] = sext i32 [[TMP21]] to i64
-; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP22]]
-; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP21]], i32 [[T_010]]
-; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-;
-; NO-EVL-LABEL: @simple_csa_int_select(
-; NO-EVL-NEXT:  entry:
-; NO-EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; NO-EVL:       for.body.preheader:
-; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; NO-EVL:       vector.ph:
-; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; NO-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; NO-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[A:%.*]], i64 0
-; NO-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; NO-EVL:       vector.body:
-; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; NO-EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP6]]
-; NO-EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
-; NO-EVL-NEXT:    [[TMP9:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
-; NO-EVL-NEXT:    [[TMP10:%.*]] = icmp slt <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP9]]
-; NO-EVL-NEXT:    [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP10]])
-; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP11]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
-; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP11]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
-; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; NO-EVL-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; NO-EVL:       middle.block:
-; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; NO-EVL-NEXT:    [[TMP13:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
-; NO-EVL-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP13]])
-; NO-EVL-NEXT:    [[TMP15:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
-; NO-EVL-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[TMP14]], 0
-; NO-EVL-NEXT:    [[TMP17:%.*]] = and i1 [[TMP15]], [[TMP16]]
-; NO-EVL-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i32 0, i32 -1
-; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP18]]
-; NO-EVL-NEXT:    [[TMP19:%.*]] = icmp sge i32 [[TMP18]], 0
-; NO-EVL-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[CSA_EXTRACT]], i32 -1
-; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; NO-EVL:       scalar.ph:
-; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       for.cond.cleanup.loopexit:
-; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; NO-EVL:       for.cond.cleanup:
-; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; NO-EVL-NEXT:    ret i32 [[T_0_LCSSA]]
-; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; NO-EVL-NEXT:    [[TMP22:%.*]] = sext i32 [[TMP21]] to i64
-; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP22]]
-; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP21]], i32 [[T_010]]
-; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-;
-; DATA-LABEL: @simple_csa_int_select(
-; DATA-NEXT:  entry:
-; DATA-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA:       for.body.preheader:
-; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; DATA-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; DATA-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DATA:       vector.ph:
-; DATA-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; DATA-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; DATA-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; DATA-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; DATA-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[A:%.*]], i64 0
-; DATA-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; DATA-NEXT:    br label [[VECTOR_BODY:%.*]]
-; DATA:       vector.body:
-; DATA-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; DATA-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP6]]
-; DATA-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; DATA-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
-; DATA-NEXT:    [[TMP9:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
-; DATA-NEXT:    [[TMP10:%.*]] = icmp slt <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP9]]
-; DATA-NEXT:    [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP10]])
-; DATA-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP11]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
-; DATA-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP11]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
-; DATA-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; DATA-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DATA-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; DATA:       middle.block:
-; DATA-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; DATA-NEXT:    [[TMP13:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
-; DATA-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP13]])
-; DATA-NEXT:    [[TMP15:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
-; DATA-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[TMP14]], 0
-; DATA-NEXT:    [[TMP17:%.*]] = and i1 [[TMP15]], [[TMP16]]
-; DATA-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i32 0, i32 -1
-; DATA-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP18]]
-; DATA-NEXT:    [[TMP19:%.*]] = icmp sge i32 [[TMP18]], 0
-; DATA-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[CSA_EXTRACT]], i32 -1
-; DATA-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; DATA-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; DATA:       scalar.ph:
-; DATA-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; DATA-NEXT:    br label [[FOR_BODY:%.*]]
-; DATA:       for.cond.cleanup.loopexit:
-; DATA-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
-; DATA:       for.cond.cleanup:
-; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; DATA-NEXT:    ret i32 [[T_0_LCSSA]]
-; DATA:       for.body:
-; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; DATA-NEXT:    [[TMP22:%.*]] = sext i32 [[TMP21]] to i64
-; DATA-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP22]]
-; DATA-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP21]], i32 [[T_010]]
-; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-;
-entry:
-  %cmp9 = icmp sgt i32 %N, 0
-  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %N to i64
-  br label %for.body
-
-for.cond.cleanup.loopexit:                        ; preds = %for.body
-  %spec.select.lcssa = phi i32 [ %spec.select, %for.body ]
-  br label %for.cond.cleanup
-
-for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
-  %t.0.lcssa = phi i32 [ -1, %entry ], [ %spec.select.lcssa, %for.cond.cleanup.loopexit ]
-  ret i32 %t.0.lcssa
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-  %t.010 = phi i32 [ -1, %for.body.preheader ], [ %spec.select, %for.body ]
-  %arrayidx = getelementptr inbounds i32, ptr %data, i64 %indvars.iv
-  %0 = load i32, ptr %arrayidx, align 4
-  %1 = sext i32 %0 to i64
-  %cmp1 = icmp slt i64 %a, %1
-  %spec.select = select i1 %cmp1, i32 %0, i32 %t.010
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; int simple_csa_int_select_induction_cmp(int N, int *data) {
-;   int t = -1;
-;   for (int i = 0; i < N; i++) {
-;     if (i < data[i])
-;       t = data[i];
-;   }
-;   return t; // use t
-; }
-define i32 @simple_csa_int_select_induction_cmp(i32 %N, ptr %data) {
-; EVL-LABEL: @simple_csa_int_select_induction_cmp(
-; EVL-NEXT:  entry:
-; EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; EVL:       for.body.preheader:
-; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; EVL:       vector.ph:
-; EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; EVL-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; EVL-NEXT:    [[TMP7:%.*]] = add <vscale x 4 x i64> [[TMP6]], zeroinitializer
-; EVL-NEXT:    [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP7]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
-; EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
-; EVL-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP10]]
-; EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP11]], i64 0
-; EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; EVL:       vector.body:
-; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP12]]
-; EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
-; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
-; EVL-NEXT:    [[TMP15:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
-; EVL-NEXT:    [[TMP16:%.*]] = icmp slt <vscale x 4 x i64> [[VEC_IND]], [[TMP15]]
-; EVL-NEXT:    [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP16]])
-; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
-; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
-; EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; EVL-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; EVL:       middle.block:
-; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; EVL-NEXT:    [[TMP19:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
-; EVL-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP19]])
-; EVL-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
-; EVL-NEXT:    [[TMP22:%.*]] = icmp eq i32 [[TMP20]], 0
-; EVL-NEXT:    [[TMP23:%.*]] = and i1 [[TMP21]], [[TMP22]]
-; EVL-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i32 0, i32 -1
-; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP24]]
-; EVL-NEXT:    [[TMP25:%.*]] = icmp sge i32 [[TMP24]], 0
-; EVL-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[CSA_EXTRACT]], i32 -1
-; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; EVL:       scalar.ph:
-; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       for.cond.cleanup.loopexit:
-; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; EVL:       for.cond.cleanup:
-; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; EVL-NEXT:    ret i32 [[T_0_LCSSA]]
-; EVL:       for.body:
-; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT:    [[TMP28:%.*]] = sext i32 [[TMP27]] to i64
-; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP28]]
-; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP27]], i32 [[T_010]]
-; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-;
-; NO-EVL-LABEL: @simple_csa_int_select_induction_cmp(
-; NO-EVL-NEXT:  entry:
-; NO-EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; NO-EVL:       for.body.preheader:
-; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; NO-EVL:       vector.ph:
-; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; NO-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; NO-EVL-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; NO-EVL-NEXT:    [[TMP7:%.*]] = add <vscale x 4 x i64> [[TMP6]], zeroinitializer
-; NO-EVL-NEXT:    [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP7]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; NO-EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
-; NO-EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
-; NO-EVL-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP10]]
-; NO-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP11]], i64 0
-; NO-EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; NO-EVL:       vector.body:
-; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; NO-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP12]]
-; NO-EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
-; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
-; NO-EVL-NEXT:    [[TMP15:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
-; NO-EVL-NEXT:    [[TMP16:%.*]] = icmp slt <vscale x 4 x i64> [[VEC_IND]], [[TMP15]]
-; NO-EVL-NEXT:    [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP16]])
-; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
-; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
-; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; NO-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; NO-EVL-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; NO-EVL:       middle.block:
-; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; NO-EVL-NEXT:    [[TMP19:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
-; NO-EVL-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP19]])
-; NO-EVL-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
-; NO-EVL-NEXT:    [[TMP22:%.*]] = icmp eq i32 [[TMP20]], 0
-; NO-EVL-NEXT:    [[TMP23:%.*]] = and i1 [[TMP21]], [[TMP22]]
-; NO-EVL-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i32 0, i32 -1
-; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP24]]
-; NO-EVL-NEXT:    [[TMP25:%.*]] = icmp sge i32 [[TMP24]], 0
-; NO-EVL-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[CSA_EXTRACT]], i32 -1
-; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; NO-EVL:       scalar.ph:
-; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       for.cond.cleanup.loopexit:
-; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; NO-EVL:       for.cond.cleanup:
-; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; NO-EVL-NEXT:    ret i32 [[T_0_LCSSA]]
-; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; NO-EVL-NEXT:    [[TMP28:%.*]] = sext i32 [[TMP27]] to i64
-; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP28]]
-; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP27]], i32 [[T_010]]
-; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-;
-; DATA-LABEL: @simple_csa_int_select_induction_cmp(
-; DATA-NEXT:  entry:
-; DATA-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA:       for.body.preheader:
-; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; DATA-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; DATA-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DATA:       vector.ph:
-; DATA-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; DATA-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; DATA-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; DATA-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; DATA-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; DATA-NEXT:    [[TMP7:%.*]] = add <vscale x 4 x i64> [[TMP6]], zeroinitializer
-; DATA-NEXT:    [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP7]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; DATA-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
-; DATA-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
-; DATA-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP10]]
-; DATA-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP11]], i64 0
-; DATA-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; DATA-NEXT:    br label [[VECTOR_BODY:%.*]]
-; DATA:       vector.body:
-; DATA-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; DATA-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP12]]
-; DATA-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
-; DATA-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
-; DATA-NEXT:    [[TMP15:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
-; DATA-NEXT:    [[TMP16:%.*]] = icmp slt <vscale x 4 x i64> [[VEC_IND]], [[TMP15]]
-; DATA-NEXT:    [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP16]])
-; DATA-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
-; DATA-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
-; DATA-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; DATA-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; DATA-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DATA-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; DATA:       middle.block:
-; DATA-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; DATA-NEXT:    [[TMP19:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
-; DATA-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP19]])
-; DATA-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
-; DATA-NEXT:    [[TMP22:%.*]] = icmp eq i32 [[TMP20]], 0
-; DATA-NEXT:    [[TMP23:%.*]] = and i1 [[TMP21]], [[TMP22]]
-; DATA-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i32 0, i32 -1
-; DATA-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP24]]
-; DATA-NEXT:    [[TMP25:%.*]] = icmp sge i32 [[TMP24]], 0
-; DATA-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[CSA_EXTRACT]], i32 -1
-; DATA-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; DATA-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; DATA:       scalar.ph:
-; DATA-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; DATA-NEXT:    br label [[FOR_BODY:%.*]]
-; DATA:       for.cond.cleanup.loopexit:
-; DATA-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
-; DATA:       for.cond.cleanup:
-; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; DATA-NEXT:    ret i32 [[T_0_LCSSA]]
-; DATA:       for.body:
-; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; DATA-NEXT:    [[TMP28:%.*]] = sext i32 [[TMP27]] to i64
-; DATA-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP28]]
-; DATA-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP27]], i32 [[T_010]]
-; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-;
-entry:
-  %cmp9 = icmp sgt i32 %N, 0
-  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %N to i64
-  br label %for.body
-
-for.cond.cleanup.loopexit:                        ; preds = %for.body
-  %spec.select.lcssa = phi i32 [ %spec.select, %for.body ]
-  br label %for.cond.cleanup
-
-for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
-  %t.0.lcssa = phi i32 [ -1, %entry ], [ %spec.select.lcssa, %for.cond.cleanup.loopexit ]
-  ret i32 %t.0.lcssa
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-  %t.010 = phi i32 [ -1, %for.body.preheader ], [ %spec.select, %for.body ]
-  %arrayidx = getelementptr inbounds i32, ptr %data, i64 %indvars.iv
-  %0 = load i32, ptr %arrayidx, align 4
-  %1 = sext i32 %0 to i64
-  %cmp1 = icmp slt i64 %indvars.iv, %1
-  %spec.select = select i1 %cmp1, i32 %0, i32 %t.010
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; float simple_csa_float_select(int N, float *data) {
-;   float t = 1.0f;
-;   for (int i = 0; i < N; i++) {
-;     if (0.0f < data[i])
-;       t = data[i];
-;   }
-;   return t; // use t
-; }
-define float @simple_csa_float_select(i32 %N, ptr %data) {
-; EVL-LABEL: @simple_csa_float_select(
-; EVL-NEXT:  entry:
-; EVL-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; EVL:       for.body.preheader:
-; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; EVL:       vector.ph:
-; EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; EVL:       vector.body:
-; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[TMP6]]
-; EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
-; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
-; EVL-NEXT:    [[TMP9:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD]], zeroinitializer
-; EVL-NEXT:    [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP9]])
-; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP10]], <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
-; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP10]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[CSA_DATA_PHI]]
-; EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; EVL-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; EVL:       middle.block:
-; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; EVL-NEXT:    [[TMP12:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
-; EVL-NEXT:    [[TMP13:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP12]])
-; EVL-NEXT:    [[TMP14:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
-; EVL-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[TMP13]], 0
-; EVL-NEXT:    [[TMP16:%.*]] = and i1 [[TMP14]], [[TMP15]]
-; EVL-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 0, i32 -1
-; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x float> [[CSA_DATA_SEL]], i32 [[TMP17]]
-; EVL-NEXT:    [[TMP18:%.*]] = icmp sge i32 [[TMP17]], 0
-; EVL-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], float [[CSA_EXTRACT]], float 1.000000e+00
-; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; EVL:       scalar.ph:
-; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       for.cond.cleanup.loopexit:
-; EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; EVL:       for.cond.cleanup:
-; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; EVL-NEXT:    ret float [[T_0_LCSSA]]
-; EVL:       for.body:
-; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[T_09:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP20:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP20]], 0.000000e+00
-; EVL-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP20]], float [[T_09]]
-; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-;
-; NO-EVL-LABEL: @simple_csa_float_select(
-; NO-EVL-NEXT:  entry:
-; NO-EVL-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; NO-EVL:       for.body.preheader:
-; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; NO-EVL:       vector.ph:
-; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; NO-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; NO-EVL:       vector.body:
-; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; NO-EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[TMP6]]
-; NO-EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
-; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
-; NO-EVL-NEXT:    [[TMP9:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD]], zeroinitializer
-; NO-EVL-NEXT:    [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP9]])
-; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP10]], <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
-; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP10]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[CSA_DATA_PHI]]
-; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; NO-EVL-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; NO-EVL:       middle.block:
-; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; NO-EVL-NEXT:    [[TMP12:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
-; NO-EVL-NEXT:    [[TMP13:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP12]])
-; NO-EVL-NEXT:    [[TMP14:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
-; NO-EVL-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[TMP13]], 0
-; NO-EVL-NEXT:    [[TMP16:%.*]] = and i1 [[TMP14]], [[TMP15]]
-; NO-EVL-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 0, i32 -1
-; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x float> [[CSA_DATA_SEL]], i32 [[TMP17]]
-; NO-EVL-NEXT:    [[TMP18:%.*]] = icmp sge i32 [[TMP17]], 0
-; NO-EVL-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], float [[CSA_EXTRACT]], float 1.000000e+00
-; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; NO-EVL:       scalar.ph:
-; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       for.cond.cleanup.loopexit:
-; NO-EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; NO-EVL:       for.cond.cleanup:
-; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; NO-EVL-NEXT:    ret float [[T_0_LCSSA]]
-; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[T_09:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP20:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; NO-EVL-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP20]], 0.000000e+00
-; NO-EVL-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP20]], float [[T_09]]
-; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-;
-; DATA-LABEL: @simple_csa_float_select(
-; DATA-NEXT:  entry:
-; DATA-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT:    br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA:       for.body.preheader:
-; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; DATA-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; DATA-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DATA:       vector.ph:
-; DATA-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; DATA-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; DATA-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; DATA-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; DATA-NEXT:    br label [[VECTOR_BODY:%.*]]
-; DATA:       vector.body:
-; DATA-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; DATA-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[TMP6]]
-; DATA-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
-; DATA-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
-; DATA-NEXT:    [[TMP9:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD]], zeroinitializer
-; DATA-NEXT:    [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP9]])
-; DATA-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP10]], <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
-; DATA-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP10]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[CSA_DATA_PHI]]
-; DATA-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; DATA-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DATA-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; DATA:       middle.block:
-; DATA-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; DATA-NEXT:    [[TMP12:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
-; DATA-NEXT:    [[TMP13:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP12]])
-; DATA-NEXT:    [[TMP14:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
-; DATA-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[TMP13]], 0
-; DATA-NEXT:    [[TMP16:%.*]] = and i1 [[TMP14]], [[TMP15]]
-; DATA-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 0, i32 -1
-; DATA-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x float> [[CSA_DATA_SEL]], i32 [[TMP17]]
-; DATA-NEXT:    [[TMP18:%.*]] = icmp sge i32 [[TMP17]], 0
-; DATA-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], float [[CSA_EXTRACT]], float 1.000000e+00
-; DATA-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; DATA-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; DATA:       scalar.ph:
-; DATA-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; DATA-NEXT:    br label [[FOR_BODY:%.*]]
-; DATA:       for.cond.cleanup.loopexit:
-; DATA-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
-; DATA:       for.cond.cleanup:
-; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; DATA-NEXT:    ret float [[T_0_LCSSA]]
-; DATA:       for.body:
-; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[T_09:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP20:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; DATA-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP20]], 0.000000e+00
-; DATA-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP20]], float [[T_09]]
-; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-;
-entry:
-  %cmp8 = icmp sgt i32 %N, 0
-  br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %N to i64
-  br label %for.body
-
-for.cond.cleanup:                                 ; preds = %for.body, %entry
-  %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.1, %for.body ]
-  ret float %t.0.lcssa
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-  %t.09 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.body ]
-  %arrayidx = getelementptr inbounds float, ptr %data, i64 %indvars.iv
-  %0 = load float, ptr %arrayidx, align 4
-  %cmp1 = fcmp ogt float %0, 0.000000e+00
-  %t.1 = select i1 %cmp1, float %0, float %t.09
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; int simple_csa_int(int N, bool *cond, int *data) {
-;   int t = -1;
-;   for (int i = 0; i < N; i++) {
-;     if (cond[i])
-;       t = data[i];
-;   }
-;   return t; // use t
-; }
-define i32 @simple_csa_int(i32 %N, ptr %cond, ptr %data) {
-; EVL-LABEL: @simple_csa_int(
-; EVL-NEXT:  entry:
-; EVL-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; EVL:       for.body.preheader:
-; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       for.cond.cleanup.loopexit:
-; EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; EVL:       for.cond.cleanup:
-; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; EVL-NEXT:    ret i32 [[T_0_LCSSA]]
-; EVL:       for.body:
-; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; EVL-NEXT:    [[T_07:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
-; EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
-; EVL:       if.then:
-; EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; EVL-NEXT:    br label [[FOR_INC]]
-; EVL:       for.inc:
-; EVL-NEXT:    [[T_1]] = phi i32 [ [[TMP1]], [[IF_THEN]] ], [ [[T_07]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; NO-EVL-LABEL: @simple_csa_int(
-; NO-EVL-NEXT:  entry:
-; NO-EVL-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; NO-EVL:       for.body.preheader:
-; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       for.cond.cleanup.loopexit:
-; NO-EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; NO-EVL:       for.cond.cleanup:
-; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; NO-EVL-NEXT:    ret i32 [[T_0_LCSSA]]
-; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; NO-EVL-NEXT:    [[T_07:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; NO-EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
-; NO-EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
-; NO-EVL:       if.then:
-; NO-EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; NO-EVL-NEXT:    br label [[FOR_INC]]
-; NO-EVL:       for.inc:
-; NO-EVL-NEXT:    [[T_1]] = phi i32 [ [[TMP1]], [[IF_THEN]] ], [ [[T_07]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; DATA-LABEL: @simple_csa_int(
-; DATA-NEXT:  entry:
-; DATA-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT:    br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA:       for.body.preheader:
-; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT:    br label [[FOR_BODY:%.*]]
-; DATA:       for.cond.cleanup.loopexit:
-; DATA-NEXT:    [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
-; DATA:       for.cond.cleanup:
-; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; DATA-NEXT:    ret i32 [[T_0_LCSSA]]
-; DATA:       for.body:
-; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; DATA-NEXT:    [[T_07:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; DATA-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
-; DATA-NEXT:    br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
-; DATA:       if.then:
-; DATA-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; DATA-NEXT:    br label [[FOR_INC]]
-; DATA:       for.inc:
-; DATA-NEXT:    [[T_1]] = phi i32 [ [[TMP1]], [[IF_THEN]] ], [ [[T_07]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-entry:
-  %cmp6 = icmp sgt i32 %N, 0
-  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %N to i64
-  br label %for.body
-
-for.cond.cleanup:                                 ; preds = %for.inc, %entry
-  %t.0.lcssa = phi i32 [ -1, %entry ], [ %t.1, %for.inc ]
-  ret i32 %t.0.lcssa
-
-for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
-  %t.07 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
-  %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %indvars.iv
-  %0 = load i8, ptr %arrayidx, align 1
-  %tobool.not = icmp eq i8 %0, 0
-  br i1 %tobool.not, label %for.inc, label %if.then
-
-if.then:                                          ; preds = %for.body
-  %arrayidx2 = getelementptr inbounds i32, ptr %data, i64 %indvars.iv
-  %1 = load i32, ptr %arrayidx2, align 4
-  br label %for.inc
-
-for.inc:                                          ; preds = %for.body, %if.then
-  %t.1 = phi i32 [ %1, %if.then ], [ %t.07, %for.body ]
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; float simple_csa_float(int N, bool *cond, float *data) {
-;   float t = 1.0f;
-;   for (int i = 0; i < N; i++) {
-;     if (cond[i])
-;       t = data[i];
-;   }
-;   return t; // use t
-; }
-define float @simple_csa_float(i32 %N, ptr %cond, ptr %data) {
-; EVL-LABEL: @simple_csa_float(
-; EVL-NEXT:  entry:
-; EVL-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; EVL:       for.body.preheader:
-; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       for.cond.cleanup.loopexit:
-; EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; EVL:       for.cond.cleanup:
-; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; EVL-NEXT:    ret float [[T_0_LCSSA]]
-; EVL:       for.body:
-; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; EVL-NEXT:    [[T_07:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
-; EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
-; EVL:       if.then:
-; EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; EVL-NEXT:    br label [[FOR_INC]]
-; EVL:       for.inc:
-; EVL-NEXT:    [[T_1]] = phi float [ [[TMP1]], [[IF_THEN]] ], [ [[T_07]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; NO-EVL-LABEL: @simple_csa_float(
-; NO-EVL-NEXT:  entry:
-; NO-EVL-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; NO-EVL:       for.body.preheader:
-; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       for.cond.cleanup.loopexit:
-; NO-EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; NO-EVL:       for.cond.cleanup:
-; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; NO-EVL-NEXT:    ret float [[T_0_LCSSA]]
-; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; NO-EVL-NEXT:    [[T_07:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; NO-EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
-; NO-EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
-; NO-EVL:       if.then:
-; NO-EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; NO-EVL-NEXT:    br label [[FOR_INC]]
-; NO-EVL:       for.inc:
-; NO-EVL-NEXT:    [[T_1]] = phi float [ [[TMP1]], [[IF_THEN]] ], [ [[T_07]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; DATA-LABEL: @simple_csa_float(
-; DATA-NEXT:  entry:
-; DATA-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT:    br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA:       for.body.preheader:
-; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT:    br label [[FOR_BODY:%.*]]
-; DATA:       for.cond.cleanup.loopexit:
-; DATA-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
-; DATA:       for.cond.cleanup:
-; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; DATA-NEXT:    ret float [[T_0_LCSSA]]
-; DATA:       for.body:
-; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; DATA-NEXT:    [[T_07:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; DATA-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
-; DATA-NEXT:    br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
-; DATA:       if.then:
-; DATA-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; DATA-NEXT:    br label [[FOR_INC]]
-; DATA:       for.inc:
-; DATA-NEXT:    [[T_1]] = phi float [ [[TMP1]], [[IF_THEN]] ], [ [[T_07]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-entry:
-  %cmp6 = icmp sgt i32 %N, 0
-  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %N to i64
-  br label %for.body
-
-for.cond.cleanup:                                 ; preds = %for.inc, %entry
-  %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.1, %for.inc ]
-  ret float %t.0.lcssa
-
-for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
-  %t.07 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
-  %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %indvars.iv
-  %0 = load i8, ptr %arrayidx, align 1
-  %tobool.not = icmp eq i8 %0, 0
-  br i1 %tobool.not, label %for.inc, label %if.then
-
-if.then:                                          ; preds = %for.body
-  %arrayidx2 = getelementptr inbounds float, ptr %data, i64 %indvars.iv
-  %1 = load float, ptr %arrayidx2, align 4
-  br label %for.inc
-
-for.inc:                                          ; preds = %for.body, %if.then
-  %t.1 = phi float [ %1, %if.then ], [ %t.07, %for.body ]
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; int csa_in_series_int_select(int N, int *data0, int *data1, int a) {
-;   int t = -1;
-;   int s = -1;
-;   for (int i = 0; i < N; i++) {
-;     if (a < data0[i])
-;       t = data0[i];
-;     if (a < data1[i])
-;       s = data1[i];
-;   }
-;   return t | s; // use t and s
-; }
-define i32 @csa_in_series_int_select(i32 %N, ptr %data0, ptr %data1, i64 %a) {
-; EVL-LABEL: @csa_in_series_int_select(
-; EVL-NEXT:  entry:
-; EVL-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; EVL:       for.body.preheader:
-; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; EVL:       vector.ph:
-; EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[A:%.*]], i64 0
-; EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; EVL:       vector.body:
-; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP6]]
-; EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
-; EVL-NEXT:    [[TMP9:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
-; EVL-NEXT:    [[TMP10:%.*]] = icmp slt <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP9]]
-; EVL-NEXT:    [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP10]])
-; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP11]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> [[CSA_MASK_PHI1]]
-; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP11]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI2]]
-; EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP6]]
-; EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
-; EVL-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i32>, ptr [[TMP13]], align 4
-; EVL-NEXT:    [[TMP14:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD3]] to <vscale x 4 x i64>
-; EVL-NEXT:    [[TMP15:%.*]] = icmp slt <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP14]]
-; EVL-NEXT:    [[TMP16:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP15]])
-; EVL-NEXT:    [[CSA_MASK_SEL4]] = select i1 [[TMP16]], <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
-; EVL-NEXT:    [[CSA_DATA_SEL5]] = select i1 [[TMP16]], <vscale x 4 x i32> [[WIDE_LOAD3]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
-; EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; EVL-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; EVL:       middle.block:
-; EVL-NEXT:    [[CSA_STEP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; EVL-NEXT:    [[TMP18:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL4]], <vscale x 4 x i32> [[CSA_STEP6]], <vscale x 4 x i32> zeroinitializer
-; EVL-NEXT:    [[TMP19:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP18]])
-; EVL-NEXT:    [[TMP20:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL4]], i64 0
-; EVL-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP19]], 0
-; EVL-NEXT:    [[TMP22:%.*]] = and i1 [[TMP20]], [[TMP21]]
-; EVL-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], i32 0, i32 -1
-; EVL-NEXT:    [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL5]], i32 [[TMP23]]
-; EVL-NEXT:    [[TMP24:%.*]] = icmp sge i32 [[TMP23]], 0
-; EVL-NEXT:    [[TMP25:%.*]] = select i1 [[TMP24]], i32 [[CSA_EXTRACT7]], i32 -1
-; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; EVL-NEXT:    [[TMP26:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
-; EVL-NEXT:    [[TMP27:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP26]])
-; EVL-NEXT:    [[TMP28:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
-; EVL-NEXT:    [[TMP29:%.*]] = icmp eq i32 [[TMP27]], 0
-; EVL-NEXT:    [[TMP30:%.*]] = and i1 [[TMP28]], [[TMP29]]
-; EVL-NEXT:    [[TMP31:%.*]] = select i1 [[TMP30]], i32 0, i32 -1
-; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP31]]
-; EVL-NEXT:    [[TMP32:%.*]] = icmp sge i32 [[TMP31]], 0
-; EVL-NEXT:    [[TMP33:%.*]] = select i1 [[TMP32]], i32 [[CSA_EXTRACT]], i32 -1
-; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; EVL:       scalar.ph:
-; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       for.cond.cleanup.loopexit:
-; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT:    [[TMP34:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
-; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; EVL:       for.cond.cleanup:
-; EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP34]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
-; EVL-NEXT:    ret i32 [[OR]]
-; EVL:       for.body:
-; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP35:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT:    [[TMP36:%.*]] = sext i32 [[TMP35]] to i64
-; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP36]]
-; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP35]], i32 [[T_022]]
-; EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP37:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; EVL-NEXT:    [[TMP38:%.*]] = sext i32 [[TMP37]] to i64
-; EVL-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[A]], [[TMP38]]
-; EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP37]], i32 [[S_023]]
-; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-;
-; NO-EVL-LABEL: @csa_in_series_int_select(
-; NO-EVL-NEXT:  entry:
-; NO-EVL-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; NO-EVL:       for.body.preheader:
-; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; NO-EVL:       vector.ph:
-; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; NO-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; NO-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[A:%.*]], i64 0
-; NO-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; NO-EVL:       vector.body:
-; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; NO-EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP6]]
-; NO-EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
-; NO-EVL-NEXT:    [[TMP9:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
-; NO-EVL-NEXT:    [[TMP10:%.*]] = icmp slt <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP9]]
-; NO-EVL-NEXT:    [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP10]])
-; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP11]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> [[CSA_MASK_PHI1]]
-; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP11]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI2]]
-; NO-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP6]]
-; NO-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
-; NO-EVL-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i32>, ptr [[TMP13]], align 4
-; NO-EVL-NEXT:    [[TMP14:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD3]] to <vscale x 4 x i64>
-; NO-EVL-NEXT:    [[TMP15:%.*]] = icmp slt <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP14]]
-; NO-EVL-NEXT:    [[TMP16:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP15]])
-; NO-EVL-NEXT:    [[CSA_MASK_SEL4]] = select i1 [[TMP16]], <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
-; NO-EVL-NEXT:    [[CSA_DATA_SEL5]] = select i1 [[TMP16]], <vscale x 4 x i32> [[WIDE_LOAD3]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
-; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; NO-EVL-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; NO-EVL:       middle.block:
-; NO-EVL-NEXT:    [[CSA_STEP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; NO-EVL-NEXT:    [[TMP18:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL4]], <vscale x 4 x i32> [[CSA_STEP6]], <vscale x 4 x i32> zeroinitializer
-; NO-EVL-NEXT:    [[TMP19:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP18]])
-; NO-EVL-NEXT:    [[TMP20:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL4]], i64 0
-; NO-EVL-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP19]], 0
-; NO-EVL-NEXT:    [[TMP22:%.*]] = and i1 [[TMP20]], [[TMP21]]
-; NO-EVL-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], i32 0, i32 -1
-; NO-EVL-NEXT:    [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL5]], i32 [[TMP23]]
-; NO-EVL-NEXT:    [[TMP24:%.*]] = icmp sge i32 [[TMP23]], 0
-; NO-EVL-NEXT:    [[TMP25:%.*]] = select i1 [[TMP24]], i32 [[CSA_EXTRACT7]], i32 -1
-; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; NO-EVL-NEXT:    [[TMP26:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
-; NO-EVL-NEXT:    [[TMP27:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP26]])
-; NO-EVL-NEXT:    [[TMP28:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
-; NO-EVL-NEXT:    [[TMP29:%.*]] = icmp eq i32 [[TMP27]], 0
-; NO-EVL-NEXT:    [[TMP30:%.*]] = and i1 [[TMP28]], [[TMP29]]
-; NO-EVL-NEXT:    [[TMP31:%.*]] = select i1 [[TMP30]], i32 0, i32 -1
-; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP31]]
-; NO-EVL-NEXT:    [[TMP32:%.*]] = icmp sge i32 [[TMP31]], 0
-; NO-EVL-NEXT:    [[TMP33:%.*]] = select i1 [[TMP32]], i32 [[CSA_EXTRACT]], i32 -1
-; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; NO-EVL:       scalar.ph:
-; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       for.cond.cleanup.loopexit:
-; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT:    [[TMP34:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
-; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; NO-EVL:       for.cond.cleanup:
-; NO-EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP34]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
-; NO-EVL-NEXT:    ret i32 [[OR]]
-; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP35:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; NO-EVL-NEXT:    [[TMP36:%.*]] = sext i32 [[TMP35]] to i64
-; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP36]]
-; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP35]], i32 [[T_022]]
-; NO-EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP37:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; NO-EVL-NEXT:    [[TMP38:%.*]] = sext i32 [[TMP37]] to i64
-; NO-EVL-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[A]], [[TMP38]]
-; NO-EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP37]], i32 [[S_023]]
-; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-;
-; DATA-LABEL: @csa_in_series_int_select(
-; DATA-NEXT:  entry:
-; DATA-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA:       for.body.preheader:
-; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; DATA-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; DATA-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DATA:       vector.ph:
-; DATA-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; DATA-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; DATA-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; DATA-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; DATA-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[A:%.*]], i64 0
-; DATA-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; DATA-NEXT:    br label [[VECTOR_BODY:%.*]]
-; DATA:       vector.body:
-; DATA-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; DATA-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP6]]
-; DATA-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; DATA-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
-; DATA-NEXT:    [[TMP9:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
-; DATA-NEXT:    [[TMP10:%.*]] = icmp slt <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP9]]
-; DATA-NEXT:    [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP10]])
-; DATA-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP11]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> [[CSA_MASK_PHI1]]
-; DATA-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP11]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI2]]
-; DATA-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP6]]
-; DATA-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
-; DATA-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i32>, ptr [[TMP13]], align 4
-; DATA-NEXT:    [[TMP14:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD3]] to <vscale x 4 x i64>
-; DATA-NEXT:    [[TMP15:%.*]] = icmp slt <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP14]]
-; DATA-NEXT:    [[TMP16:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP15]])
-; DATA-NEXT:    [[CSA_MASK_SEL4]] = select i1 [[TMP16]], <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
-; DATA-NEXT:    [[CSA_DATA_SEL5]] = select i1 [[TMP16]], <vscale x 4 x i32> [[WIDE_LOAD3]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
-; DATA-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; DATA-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DATA-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; DATA:       middle.block:
-; DATA-NEXT:    [[CSA_STEP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; DATA-NEXT:    [[TMP18:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL4]], <vscale x 4 x i32> [[CSA_STEP6]], <vscale x 4 x i32> zeroinitializer
-; DATA-NEXT:    [[TMP19:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP18]])
-; DATA-NEXT:    [[TMP20:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL4]], i64 0
-; DATA-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP19]], 0
-; DATA-NEXT:    [[TMP22:%.*]] = and i1 [[TMP20]], [[TMP21]]
-; DATA-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], i32 0, i32 -1
-; DATA-NEXT:    [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL5]], i32 [[TMP23]]
-; DATA-NEXT:    [[TMP24:%.*]] = icmp sge i32 [[TMP23]], 0
-; DATA-NEXT:    [[TMP25:%.*]] = select i1 [[TMP24]], i32 [[CSA_EXTRACT7]], i32 -1
-; DATA-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; DATA-NEXT:    [[TMP26:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
-; DATA-NEXT:    [[TMP27:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP26]])
-; DATA-NEXT:    [[TMP28:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
-; DATA-NEXT:    [[TMP29:%.*]] = icmp eq i32 [[TMP27]], 0
-; DATA-NEXT:    [[TMP30:%.*]] = and i1 [[TMP28]], [[TMP29]]
-; DATA-NEXT:    [[TMP31:%.*]] = select i1 [[TMP30]], i32 0, i32 -1
-; DATA-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP31]]
-; DATA-NEXT:    [[TMP32:%.*]] = icmp sge i32 [[TMP31]], 0
-; DATA-NEXT:    [[TMP33:%.*]] = select i1 [[TMP32]], i32 [[CSA_EXTRACT]], i32 -1
-; DATA-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; DATA-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; DATA:       scalar.ph:
-; DATA-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; DATA-NEXT:    br label [[FOR_BODY:%.*]]
-; DATA:       for.cond.cleanup.loopexit:
-; DATA-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT:    [[TMP34:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
-; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
-; DATA:       for.cond.cleanup:
-; DATA-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP34]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
-; DATA-NEXT:    ret i32 [[OR]]
-; DATA:       for.body:
-; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP35:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; DATA-NEXT:    [[TMP36:%.*]] = sext i32 [[TMP35]] to i64
-; DATA-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP36]]
-; DATA-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP35]], i32 [[T_022]]
-; DATA-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP37:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; DATA-NEXT:    [[TMP38:%.*]] = sext i32 [[TMP37]] to i64
-; DATA-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[A]], [[TMP38]]
-; DATA-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP37]], i32 [[S_023]]
-; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-;
-entry:
-  %cmp21 = icmp sgt i32 %N, 0
-  br i1 %cmp21, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %N to i64
-  br label %for.body
-
-for.cond.cleanup.loopexit:                        ; preds = %for.body
-  %0 = or i32 %s.1, %spec.select
-  br label %for.cond.cleanup
-
-for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
-  %or = phi i32 [ %0, %for.cond.cleanup.loopexit ], [ -1, %entry ]
-  ret i32 %or
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-  %s.023 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.body ]
-  %t.022 = phi i32 [ -1, %for.body.preheader ], [ %spec.select, %for.body ]
-  %arrayidx = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
-  %1 = load i32, ptr %arrayidx, align 4
-  %2 = sext i32 %1 to i64
-  %cmp1 = icmp slt i64 %a, %2
-  %spec.select = select i1 %cmp1, i32 %1, i32 %t.022
-  %arrayidx5 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
-  %3 = load i32, ptr %arrayidx5, align 4
-  %4 = sext i32 %3 to i64
-  %cmp6 = icmp slt i64 %a, %4
-  %s.1 = select i1 %cmp6, i32 %3, i32 %s.023
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; int csa_in_series_int_select_induction_cmp(int N, int *data0, int *data1) {
-;   int t = -1;
-;   int s = -1;
-;   for (int i = 0; i < N; i++) {
-;     if (i < data0[i])
-;       t = data0[i];
-;     if (i < data1[i])
-;       s = data1[i];
-;   }
-;   return t | s; // use t and s
-; }
-define i32 @csa_in_series_int_select_induction_cmp(i32 %N, ptr %data0, ptr %data1) {
-; EVL-LABEL: @csa_in_series_int_select_induction_cmp(
-; EVL-NEXT:  entry:
-; EVL-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; EVL:       for.body.preheader:
-; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; EVL:       vector.ph:
-; EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; EVL-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; EVL-NEXT:    [[TMP7:%.*]] = add <vscale x 4 x i64> [[TMP6]], zeroinitializer
-; EVL-NEXT:    [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP7]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
-; EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
-; EVL-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP10]]
-; EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP11]], i64 0
-; EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; EVL:       vector.body:
-; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP12]]
-; EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
-; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
-; EVL-NEXT:    [[TMP15:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
-; EVL-NEXT:    [[TMP16:%.*]] = icmp slt <vscale x 4 x i64> [[VEC_IND]], [[TMP15]]
-; EVL-NEXT:    [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP16]])
-; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[CSA_MASK_PHI1]]
-; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI2]]
-; EVL-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP12]]
-; EVL-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 0
-; EVL-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i32>, ptr [[TMP19]], align 4
-; EVL-NEXT:    [[TMP20:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD3]] to <vscale x 4 x i64>
-; EVL-NEXT:    [[TMP21:%.*]] = icmp slt <vscale x 4 x i64> [[VEC_IND]], [[TMP20]]
-; EVL-NEXT:    [[TMP22:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP21]])
-; EVL-NEXT:    [[CSA_MASK_SEL4]] = select i1 [[TMP22]], <vscale x 4 x i1> [[TMP21]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
-; EVL-NEXT:    [[CSA_DATA_SEL5]] = select i1 [[TMP22]], <vscale x 4 x i32> [[WIDE_LOAD3]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
-; EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; EVL-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; EVL:       middle.block:
-; EVL-NEXT:    [[CSA_STEP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; EVL-NEXT:    [[TMP24:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL4]], <vscale x 4 x i32> [[CSA_STEP6]], <vscale x 4 x i32> zeroinitializer
-; EVL-NEXT:    [[TMP25:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP24]])
-; EVL-NEXT:    [[TMP26:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL4]], i64 0
-; EVL-NEXT:    [[TMP27:%.*]] = icmp eq i32 [[TMP25]], 0
-; EVL-NEXT:    [[TMP28:%.*]] = and i1 [[TMP26]], [[TMP27]]
-; EVL-NEXT:    [[TMP29:%.*]] = select i1 [[TMP28]], i32 0, i32 -1
-; EVL-NEXT:    [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL5]], i32 [[TMP29]]
-; EVL-NEXT:    [[TMP30:%.*]] = icmp sge i32 [[TMP29]], 0
-; EVL-NEXT:    [[TMP31:%.*]] = select i1 [[TMP30]], i32 [[CSA_EXTRACT7]], i32 -1
-; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; EVL-NEXT:    [[TMP32:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
-; EVL-NEXT:    [[TMP33:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP32]])
-; EVL-NEXT:    [[TMP34:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
-; EVL-NEXT:    [[TMP35:%.*]] = icmp eq i32 [[TMP33]], 0
-; EVL-NEXT:    [[TMP36:%.*]] = and i1 [[TMP34]], [[TMP35]]
-; EVL-NEXT:    [[TMP37:%.*]] = select i1 [[TMP36]], i32 0, i32 -1
-; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP37]]
-; EVL-NEXT:    [[TMP38:%.*]] = icmp sge i32 [[TMP37]], 0
-; EVL-NEXT:    [[TMP39:%.*]] = select i1 [[TMP38]], i32 [[CSA_EXTRACT]], i32 -1
-; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; EVL:       scalar.ph:
-; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       for.cond.cleanup.loopexit:
-; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT:    [[TMP40:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
-; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; EVL:       for.cond.cleanup:
-; EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP40]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
-; EVL-NEXT:    ret i32 [[OR]]
-; EVL:       for.body:
-; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP41:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT:    [[TMP42:%.*]] = sext i32 [[TMP41]] to i64
-; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP42]]
-; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP41]], i32 [[T_022]]
-; EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP43:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; EVL-NEXT:    [[TMP44:%.*]] = sext i32 [[TMP43]] to i64
-; EVL-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP44]]
-; EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP43]], i32 [[S_023]]
-; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-;
-; NO-EVL-LABEL: @csa_in_series_int_select_induction_cmp(
-; NO-EVL-NEXT:  entry:
-; NO-EVL-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; NO-EVL:       for.body.preheader:
-; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; NO-EVL:       vector.ph:
-; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; NO-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; NO-EVL-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; NO-EVL-NEXT:    [[TMP7:%.*]] = add <vscale x 4 x i64> [[TMP6]], zeroinitializer
-; NO-EVL-NEXT:    [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP7]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; NO-EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
-; NO-EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
-; NO-EVL-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP10]]
-; NO-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP11]], i64 0
-; NO-EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; NO-EVL:       vector.body:
-; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; NO-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP12]]
-; NO-EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
-; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
-; NO-EVL-NEXT:    [[TMP15:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
-; NO-EVL-NEXT:    [[TMP16:%.*]] = icmp slt <vscale x 4 x i64> [[VEC_IND]], [[TMP15]]
-; NO-EVL-NEXT:    [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP16]])
-; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[CSA_MASK_PHI1]]
-; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI2]]
-; NO-EVL-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP12]]
-; NO-EVL-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 0
-; NO-EVL-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i32>, ptr [[TMP19]], align 4
-; NO-EVL-NEXT:    [[TMP20:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD3]] to <vscale x 4 x i64>
-; NO-EVL-NEXT:    [[TMP21:%.*]] = icmp slt <vscale x 4 x i64> [[VEC_IND]], [[TMP20]]
-; NO-EVL-NEXT:    [[TMP22:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP21]])
-; NO-EVL-NEXT:    [[CSA_MASK_SEL4]] = select i1 [[TMP22]], <vscale x 4 x i1> [[TMP21]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
-; NO-EVL-NEXT:    [[CSA_DATA_SEL5]] = select i1 [[TMP22]], <vscale x 4 x i32> [[WIDE_LOAD3]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
-; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; NO-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; NO-EVL-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; NO-EVL:       middle.block:
-; NO-EVL-NEXT:    [[CSA_STEP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; NO-EVL-NEXT:    [[TMP24:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL4]], <vscale x 4 x i32> [[CSA_STEP6]], <vscale x 4 x i32> zeroinitializer
-; NO-EVL-NEXT:    [[TMP25:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP24]])
-; NO-EVL-NEXT:    [[TMP26:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL4]], i64 0
-; NO-EVL-NEXT:    [[TMP27:%.*]] = icmp eq i32 [[TMP25]], 0
-; NO-EVL-NEXT:    [[TMP28:%.*]] = and i1 [[TMP26]], [[TMP27]]
-; NO-EVL-NEXT:    [[TMP29:%.*]] = select i1 [[TMP28]], i32 0, i32 -1
-; NO-EVL-NEXT:    [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL5]], i32 [[TMP29]]
-; NO-EVL-NEXT:    [[TMP30:%.*]] = icmp sge i32 [[TMP29]], 0
-; NO-EVL-NEXT:    [[TMP31:%.*]] = select i1 [[TMP30]], i32 [[CSA_EXTRACT7]], i32 -1
-; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; NO-EVL-NEXT:    [[TMP32:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
-; NO-EVL-NEXT:    [[TMP33:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP32]])
-; NO-EVL-NEXT:    [[TMP34:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
-; NO-EVL-NEXT:    [[TMP35:%.*]] = icmp eq i32 [[TMP33]], 0
-; NO-EVL-NEXT:    [[TMP36:%.*]] = and i1 [[TMP34]], [[TMP35]]
-; NO-EVL-NEXT:    [[TMP37:%.*]] = select i1 [[TMP36]], i32 0, i32 -1
-; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP37]]
-; NO-EVL-NEXT:    [[TMP38:%.*]] = icmp sge i32 [[TMP37]], 0
-; NO-EVL-NEXT:    [[TMP39:%.*]] = select i1 [[TMP38]], i32 [[CSA_EXTRACT]], i32 -1
-; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; NO-EVL:       scalar.ph:
-; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       for.cond.cleanup.loopexit:
-; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT:    [[TMP40:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
-; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; NO-EVL:       for.cond.cleanup:
-; NO-EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP40]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
-; NO-EVL-NEXT:    ret i32 [[OR]]
-; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP41:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; NO-EVL-NEXT:    [[TMP42:%.*]] = sext i32 [[TMP41]] to i64
-; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP42]]
-; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP41]], i32 [[T_022]]
-; NO-EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP43:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; NO-EVL-NEXT:    [[TMP44:%.*]] = sext i32 [[TMP43]] to i64
-; NO-EVL-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP44]]
-; NO-EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP43]], i32 [[S_023]]
-; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-;
-; DATA-LABEL: @csa_in_series_int_select_induction_cmp(
-; DATA-NEXT:  entry:
-; DATA-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA:       for.body.preheader:
-; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; DATA-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; DATA-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DATA:       vector.ph:
-; DATA-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; DATA-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; DATA-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; DATA-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; DATA-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; DATA-NEXT:    [[TMP7:%.*]] = add <vscale x 4 x i64> [[TMP6]], zeroinitializer
-; DATA-NEXT:    [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP7]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; DATA-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
-; DATA-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
-; DATA-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP10]]
-; DATA-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP11]], i64 0
-; DATA-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; DATA-NEXT:    br label [[VECTOR_BODY:%.*]]
-; DATA:       vector.body:
-; DATA-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; DATA-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP12]]
-; DATA-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
-; DATA-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
-; DATA-NEXT:    [[TMP15:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
-; DATA-NEXT:    [[TMP16:%.*]] = icmp slt <vscale x 4 x i64> [[VEC_IND]], [[TMP15]]
-; DATA-NEXT:    [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP16]])
-; DATA-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[CSA_MASK_PHI1]]
-; DATA-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI2]]
-; DATA-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP12]]
-; DATA-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 0
-; DATA-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i32>, ptr [[TMP19]], align 4
-; DATA-NEXT:    [[TMP20:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD3]] to <vscale x 4 x i64>
-; DATA-NEXT:    [[TMP21:%.*]] = icmp slt <vscale x 4 x i64> [[VEC_IND]], [[TMP20]]
-; DATA-NEXT:    [[TMP22:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP21]])
-; DATA-NEXT:    [[CSA_MASK_SEL4]] = select i1 [[TMP22]], <vscale x 4 x i1> [[TMP21]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
-; DATA-NEXT:    [[CSA_DATA_SEL5]] = select i1 [[TMP22]], <vscale x 4 x i32> [[WIDE_LOAD3]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
-; DATA-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; DATA-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; DATA-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DATA-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; DATA:       middle.block:
-; DATA-NEXT:    [[CSA_STEP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; DATA-NEXT:    [[TMP24:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL4]], <vscale x 4 x i32> [[CSA_STEP6]], <vscale x 4 x i32> zeroinitializer
-; DATA-NEXT:    [[TMP25:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP24]])
-; DATA-NEXT:    [[TMP26:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL4]], i64 0
-; DATA-NEXT:    [[TMP27:%.*]] = icmp eq i32 [[TMP25]], 0
-; DATA-NEXT:    [[TMP28:%.*]] = and i1 [[TMP26]], [[TMP27]]
-; DATA-NEXT:    [[TMP29:%.*]] = select i1 [[TMP28]], i32 0, i32 -1
-; DATA-NEXT:    [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL5]], i32 [[TMP29]]
-; DATA-NEXT:    [[TMP30:%.*]] = icmp sge i32 [[TMP29]], 0
-; DATA-NEXT:    [[TMP31:%.*]] = select i1 [[TMP30]], i32 [[CSA_EXTRACT7]], i32 -1
-; DATA-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; DATA-NEXT:    [[TMP32:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
-; DATA-NEXT:    [[TMP33:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP32]])
-; DATA-NEXT:    [[TMP34:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
-; DATA-NEXT:    [[TMP35:%.*]] = icmp eq i32 [[TMP33]], 0
-; DATA-NEXT:    [[TMP36:%.*]] = and i1 [[TMP34]], [[TMP35]]
-; DATA-NEXT:    [[TMP37:%.*]] = select i1 [[TMP36]], i32 0, i32 -1
-; DATA-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP37]]
-; DATA-NEXT:    [[TMP38:%.*]] = icmp sge i32 [[TMP37]], 0
-; DATA-NEXT:    [[TMP39:%.*]] = select i1 [[TMP38]], i32 [[CSA_EXTRACT]], i32 -1
-; DATA-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; DATA-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; DATA:       scalar.ph:
-; DATA-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; DATA-NEXT:    br label [[FOR_BODY:%.*]]
-; DATA:       for.cond.cleanup.loopexit:
-; DATA-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT:    [[TMP40:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
-; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
-; DATA:       for.cond.cleanup:
-; DATA-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP40]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
-; DATA-NEXT:    ret i32 [[OR]]
-; DATA:       for.body:
-; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP41:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; DATA-NEXT:    [[TMP42:%.*]] = sext i32 [[TMP41]] to i64
-; DATA-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP42]]
-; DATA-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP41]], i32 [[T_022]]
-; DATA-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP43:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; DATA-NEXT:    [[TMP44:%.*]] = sext i32 [[TMP43]] to i64
-; DATA-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP44]]
-; DATA-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP43]], i32 [[S_023]]
-; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-;
-entry:
-  %cmp21 = icmp sgt i32 %N, 0
-  br i1 %cmp21, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %N to i64
-  br label %for.body
-
-for.cond.cleanup.loopexit:                        ; preds = %for.body
-  %0 = or i32 %s.1, %spec.select
-  br label %for.cond.cleanup
-
-for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
-  %or = phi i32 [ %0, %for.cond.cleanup.loopexit ], [ -1, %entry ]
-  ret i32 %or
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-  %s.023 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.body ]
-  %t.022 = phi i32 [ -1, %for.body.preheader ], [ %spec.select, %for.body ]
-  %arrayidx = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
-  %1 = load i32, ptr %arrayidx, align 4
-  %2 = sext i32 %1 to i64
-  %cmp1 = icmp slt i64 %indvars.iv, %2
-  %spec.select = select i1 %cmp1, i32 %1, i32 %t.022
-  %arrayidx5 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
-  %3 = load i32, ptr %arrayidx5, align 4
-  %4 = sext i32 %3 to i64
-  %cmp6 = icmp slt i64 %indvars.iv, %4
-  %s.1 = select i1 %cmp6, i32 %3, i32 %s.023
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; float csa_in_series_float_select(int N, float *data0,
-;                           float *data1) {
-;   float t = 1.0f;
-;   float s = 1.0f;
-;   for (int i = 0; i < N; i++) {
-;     if (0.0f < data0[i])
-;       t = data0[i];
-;     if (0.0f <data1[i])
-;       s = data1[i];
-;   }
-;   return t + s; // use t and s
-; }
-define float @csa_in_series_float_select(i32 %N, ptr %data0, ptr %data1) {
-; EVL-LABEL: @csa_in_series_float_select(
-; EVL-NEXT:  entry:
-; EVL-NEXT:    [[CMP19:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; EVL:       for.body.preheader:
-; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; EVL:       vector.ph:
-; EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; EVL:       vector.body:
-; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[TMP6]]
-; EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
-; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
-; EVL-NEXT:    [[TMP9:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD]], zeroinitializer
-; EVL-NEXT:    [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP9]])
-; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP10]], <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> [[CSA_MASK_PHI1]]
-; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP10]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[CSA_DATA_PHI2]]
-; EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[TMP6]]
-; EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0
-; EVL-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, ptr [[TMP12]], align 4
-; EVL-NEXT:    [[TMP13:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD3]], zeroinitializer
-; EVL-NEXT:    [[TMP14:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP13]])
-; EVL-NEXT:    [[CSA_MASK_SEL4]] = select i1 [[TMP14]], <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
-; EVL-NEXT:    [[CSA_DATA_SEL5]] = select i1 [[TMP14]], <vscale x 4 x float> [[WIDE_LOAD3]], <vscale x 4 x float> [[CSA_DATA_PHI]]
-; EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; EVL-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; EVL:       middle.block:
-; EVL-NEXT:    [[CSA_STEP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; EVL-NEXT:    [[TMP16:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL4]], <vscale x 4 x i32> [[CSA_STEP6]], <vscale x 4 x i32> zeroinitializer
-; EVL-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP16]])
-; EVL-NEXT:    [[TMP18:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL4]], i64 0
-; EVL-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[TMP17]], 0
-; EVL-NEXT:    [[TMP20:%.*]] = and i1 [[TMP18]], [[TMP19]]
-; EVL-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], i32 0, i32 -1
-; EVL-NEXT:    [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 4 x float> [[CSA_DATA_SEL5]], i32 [[TMP21]]
-; EVL-NEXT:    [[TMP22:%.*]] = icmp sge i32 [[TMP21]], 0
-; EVL-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], float [[CSA_EXTRACT7]], float 1.000000e+00
-; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; EVL-NEXT:    [[TMP24:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
-; EVL-NEXT:    [[TMP25:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP24]])
-; EVL-NEXT:    [[TMP26:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
-; EVL-NEXT:    [[TMP27:%.*]] = icmp eq i32 [[TMP25]], 0
-; EVL-NEXT:    [[TMP28:%.*]] = and i1 [[TMP26]], [[TMP27]]
-; EVL-NEXT:    [[TMP29:%.*]] = select i1 [[TMP28]], i32 0, i32 -1
-; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x float> [[CSA_DATA_SEL]], i32 [[TMP29]]
-; EVL-NEXT:    [[TMP30:%.*]] = icmp sge i32 [[TMP29]], 0
-; EVL-NEXT:    [[TMP31:%.*]] = select i1 [[TMP30]], float [[CSA_EXTRACT]], float 1.000000e+00
-; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; EVL:       scalar.ph:
-; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       for.cond.cleanup.loopexit:
-; EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT:    [[TMP32:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
-; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; EVL:       for.cond.cleanup:
-; EVL-NEXT:    [[ADD:%.*]] = phi float [ [[TMP32]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
-; EVL-NEXT:    ret float [[ADD]]
-; EVL:       for.body:
-; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[S_021:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[T_020:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP33:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP33]], 0.000000e+00
-; EVL-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP33]], float [[T_020]]
-; EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP34:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
-; EVL-NEXT:    [[CMP6:%.*]] = fcmp ogt float [[TMP34]], 0.000000e+00
-; EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], float [[TMP34]], float [[S_021]]
-; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-;
-; NO-EVL-LABEL: @csa_in_series_float_select(
-; NO-EVL-NEXT:  entry:
-; NO-EVL-NEXT:    [[CMP19:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; NO-EVL:       for.body.preheader:
-; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; NO-EVL:       vector.ph:
-; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; NO-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; NO-EVL:       vector.body:
-; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; NO-EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[TMP6]]
-; NO-EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
-; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
-; NO-EVL-NEXT:    [[TMP9:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD]], zeroinitializer
-; NO-EVL-NEXT:    [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP9]])
-; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP10]], <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> [[CSA_MASK_PHI1]]
-; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP10]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[CSA_DATA_PHI2]]
-; NO-EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[TMP6]]
-; NO-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0
-; NO-EVL-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, ptr [[TMP12]], align 4
-; NO-EVL-NEXT:    [[TMP13:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD3]], zeroinitializer
-; NO-EVL-NEXT:    [[TMP14:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP13]])
-; NO-EVL-NEXT:    [[CSA_MASK_SEL4]] = select i1 [[TMP14]], <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
-; NO-EVL-NEXT:    [[CSA_DATA_SEL5]] = select i1 [[TMP14]], <vscale x 4 x float> [[WIDE_LOAD3]], <vscale x 4 x float> [[CSA_DATA_PHI]]
-; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; NO-EVL-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; NO-EVL:       middle.block:
-; NO-EVL-NEXT:    [[CSA_STEP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; NO-EVL-NEXT:    [[TMP16:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL4]], <vscale x 4 x i32> [[CSA_STEP6]], <vscale x 4 x i32> zeroinitializer
-; NO-EVL-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP16]])
-; NO-EVL-NEXT:    [[TMP18:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL4]], i64 0
-; NO-EVL-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[TMP17]], 0
-; NO-EVL-NEXT:    [[TMP20:%.*]] = and i1 [[TMP18]], [[TMP19]]
-; NO-EVL-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], i32 0, i32 -1
-; NO-EVL-NEXT:    [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 4 x float> [[CSA_DATA_SEL5]], i32 [[TMP21]]
-; NO-EVL-NEXT:    [[TMP22:%.*]] = icmp sge i32 [[TMP21]], 0
-; NO-EVL-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], float [[CSA_EXTRACT7]], float 1.000000e+00
-; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; NO-EVL-NEXT:    [[TMP24:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
-; NO-EVL-NEXT:    [[TMP25:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP24]])
-; NO-EVL-NEXT:    [[TMP26:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
-; NO-EVL-NEXT:    [[TMP27:%.*]] = icmp eq i32 [[TMP25]], 0
-; NO-EVL-NEXT:    [[TMP28:%.*]] = and i1 [[TMP26]], [[TMP27]]
-; NO-EVL-NEXT:    [[TMP29:%.*]] = select i1 [[TMP28]], i32 0, i32 -1
-; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x float> [[CSA_DATA_SEL]], i32 [[TMP29]]
-; NO-EVL-NEXT:    [[TMP30:%.*]] = icmp sge i32 [[TMP29]], 0
-; NO-EVL-NEXT:    [[TMP31:%.*]] = select i1 [[TMP30]], float [[CSA_EXTRACT]], float 1.000000e+00
-; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; NO-EVL:       scalar.ph:
-; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       for.cond.cleanup.loopexit:
-; NO-EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT:    [[TMP32:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
-; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; NO-EVL:       for.cond.cleanup:
-; NO-EVL-NEXT:    [[ADD:%.*]] = phi float [ [[TMP32]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
-; NO-EVL-NEXT:    ret float [[ADD]]
-; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[S_021:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[T_020:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP33:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; NO-EVL-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP33]], 0.000000e+00
-; NO-EVL-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP33]], float [[T_020]]
-; NO-EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP34:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
-; NO-EVL-NEXT:    [[CMP6:%.*]] = fcmp ogt float [[TMP34]], 0.000000e+00
-; NO-EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], float [[TMP34]], float [[S_021]]
-; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-;
-; DATA-LABEL: @csa_in_series_float_select(
-; DATA-NEXT:  entry:
-; DATA-NEXT:    [[CMP19:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT:    br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA:       for.body.preheader:
-; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; DATA-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; DATA-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DATA:       vector.ph:
-; DATA-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; DATA-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; DATA-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; DATA-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; DATA-NEXT:    br label [[VECTOR_BODY:%.*]]
-; DATA:       vector.body:
-; DATA-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; DATA-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[TMP6]]
-; DATA-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
-; DATA-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
-; DATA-NEXT:    [[TMP9:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD]], zeroinitializer
-; DATA-NEXT:    [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP9]])
-; DATA-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP10]], <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> [[CSA_MASK_PHI1]]
-; DATA-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP10]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[CSA_DATA_PHI2]]
-; DATA-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[TMP6]]
-; DATA-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0
-; DATA-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, ptr [[TMP12]], align 4
-; DATA-NEXT:    [[TMP13:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD3]], zeroinitializer
-; DATA-NEXT:    [[TMP14:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP13]])
-; DATA-NEXT:    [[CSA_MASK_SEL4]] = select i1 [[TMP14]], <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
-; DATA-NEXT:    [[CSA_DATA_SEL5]] = select i1 [[TMP14]], <vscale x 4 x float> [[WIDE_LOAD3]], <vscale x 4 x float> [[CSA_DATA_PHI]]
-; DATA-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; DATA-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DATA-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; DATA:       middle.block:
-; DATA-NEXT:    [[CSA_STEP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; DATA-NEXT:    [[TMP16:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL4]], <vscale x 4 x i32> [[CSA_STEP6]], <vscale x 4 x i32> zeroinitializer
-; DATA-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP16]])
-; DATA-NEXT:    [[TMP18:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL4]], i64 0
-; DATA-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[TMP17]], 0
-; DATA-NEXT:    [[TMP20:%.*]] = and i1 [[TMP18]], [[TMP19]]
-; DATA-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], i32 0, i32 -1
-; DATA-NEXT:    [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 4 x float> [[CSA_DATA_SEL5]], i32 [[TMP21]]
-; DATA-NEXT:    [[TMP22:%.*]] = icmp sge i32 [[TMP21]], 0
-; DATA-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], float [[CSA_EXTRACT7]], float 1.000000e+00
-; DATA-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; DATA-NEXT:    [[TMP24:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
-; DATA-NEXT:    [[TMP25:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP24]])
-; DATA-NEXT:    [[TMP26:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
-; DATA-NEXT:    [[TMP27:%.*]] = icmp eq i32 [[TMP25]], 0
-; DATA-NEXT:    [[TMP28:%.*]] = and i1 [[TMP26]], [[TMP27]]
-; DATA-NEXT:    [[TMP29:%.*]] = select i1 [[TMP28]], i32 0, i32 -1
-; DATA-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x float> [[CSA_DATA_SEL]], i32 [[TMP29]]
-; DATA-NEXT:    [[TMP30:%.*]] = icmp sge i32 [[TMP29]], 0
-; DATA-NEXT:    [[TMP31:%.*]] = select i1 [[TMP30]], float [[CSA_EXTRACT]], float 1.000000e+00
-; DATA-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; DATA-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; DATA:       scalar.ph:
-; DATA-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; DATA-NEXT:    br label [[FOR_BODY:%.*]]
-; DATA:       for.cond.cleanup.loopexit:
-; DATA-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT:    [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT:    [[TMP32:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
-; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
-; DATA:       for.cond.cleanup:
-; DATA-NEXT:    [[ADD:%.*]] = phi float [ [[TMP32]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
-; DATA-NEXT:    ret float [[ADD]]
-; DATA:       for.body:
-; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[S_021:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[T_020:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP33:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; DATA-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP33]], 0.000000e+00
-; DATA-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP33]], float [[T_020]]
-; DATA-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP34:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
-; DATA-NEXT:    [[CMP6:%.*]] = fcmp ogt float [[TMP34]], 0.000000e+00
-; DATA-NEXT:    [[S_1]] = select i1 [[CMP6]], float [[TMP34]], float [[S_021]]
-; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-;
-entry:
-  %cmp19 = icmp sgt i32 %N, 0
-  br i1 %cmp19, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %N to i64
-  br label %for.body
-
-for.cond.cleanup.loopexit:                        ; preds = %for.body
-  %0 = fadd float %t.1, %s.1
-  br label %for.cond.cleanup
-
-for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
-  %add = phi float [ %0, %for.cond.cleanup.loopexit ], [ 2.000000e+00, %entry ]
-  ret float %add
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-  %s.021 = phi float [ 1.000000e+00, %for.body.preheader ], [ %s.1, %for.body ]
-  %t.020 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.body ]
-  %arrayidx = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
-  %1 = load float, ptr %arrayidx, align 4
-  %cmp1 = fcmp ogt float %1, 0.000000e+00
-  %t.1 = select i1 %cmp1, float %1, float %t.020
-  %arrayidx5 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
-  %2 = load float, ptr %arrayidx5, align 4
-  %cmp6 = fcmp ogt float %2, 0.000000e+00
-  %s.1 = select i1 %cmp6, float %2, float %s.021
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; int csa_in_series_int(int N, bool *cond0, bool *cond1, int *data0, int *data1) {
-;   int t = -1;
-;   int s = -1;
-;   for (int i = 0; i < N; i++) {
-;     if (cond0[i])
-;       t = data0[i];
-;     if (cond1[i])
-;       s = data1[i];
-;   }
-;   return t | s; // use t and s
-; }
-define i32 @csa_in_series_int(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
-; EVL-LABEL: @csa_in_series_int(
-; EVL-NEXT:  entry:
-; EVL-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; EVL:       for.body.preheader:
-; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       for.cond.cleanup.loopexit:
-; EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_INC:%.*]] ]
-; EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC]] ]
-; EVL-NEXT:    [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[T_1_LCSSA]]
-; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; EVL:       for.cond.cleanup:
-; EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
-; EVL-NEXT:    ret i32 [[OR]]
-; EVL:       for.body:
-; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; EVL-NEXT:    [[S_017:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
-; EVL-NEXT:    [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
-; EVL:       if.then:
-; EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; EVL-NEXT:    br label [[IF_END]]
-; EVL:       if.end:
-; EVL-NEXT:    [[T_1]] = phi i32 [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; EVL-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
-; EVL-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
-; EVL:       if.then6:
-; EVL-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4
-; EVL-NEXT:    br label [[FOR_INC]]
-; EVL:       for.inc:
-; EVL-NEXT:    [[S_1]] = phi i32 [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_END]] ]
-; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; NO-EVL-LABEL: @csa_in_series_int(
-; NO-EVL-NEXT:  entry:
-; NO-EVL-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; NO-EVL:       for.body.preheader:
-; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       for.cond.cleanup.loopexit:
-; NO-EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_INC:%.*]] ]
-; NO-EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC]] ]
-; NO-EVL-NEXT:    [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[T_1_LCSSA]]
-; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; NO-EVL:       for.cond.cleanup:
-; NO-EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
-; NO-EVL-NEXT:    ret i32 [[OR]]
-; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; NO-EVL-NEXT:    [[S_017:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
-; NO-EVL-NEXT:    [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; NO-EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; NO-EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
-; NO-EVL:       if.then:
-; NO-EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; NO-EVL-NEXT:    br label [[IF_END]]
-; NO-EVL:       if.end:
-; NO-EVL-NEXT:    [[T_1]] = phi i32 [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; NO-EVL-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
-; NO-EVL-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
-; NO-EVL:       if.then6:
-; NO-EVL-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4
-; NO-EVL-NEXT:    br label [[FOR_INC]]
-; NO-EVL:       for.inc:
-; NO-EVL-NEXT:    [[S_1]] = phi i32 [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_END]] ]
-; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; DATA-LABEL: @csa_in_series_int(
-; DATA-NEXT:  entry:
-; DATA-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA:       for.body.preheader:
-; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT:    br label [[FOR_BODY:%.*]]
-; DATA:       for.cond.cleanup.loopexit:
-; DATA-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_INC:%.*]] ]
-; DATA-NEXT:    [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC]] ]
-; DATA-NEXT:    [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[T_1_LCSSA]]
-; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
-; DATA:       for.cond.cleanup:
-; DATA-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
-; DATA-NEXT:    ret i32 [[OR]]
-; DATA:       for.body:
-; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; DATA-NEXT:    [[S_017:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
-; DATA-NEXT:    [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; DATA-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; DATA-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
-; DATA:       if.then:
-; DATA-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; DATA-NEXT:    br label [[IF_END]]
-; DATA:       if.end:
-; DATA-NEXT:    [[T_1]] = phi i32 [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; DATA-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
-; DATA-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
-; DATA:       if.then6:
-; DATA-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4
-; DATA-NEXT:    br label [[FOR_INC]]
-; DATA:       for.inc:
-; DATA-NEXT:    [[S_1]] = phi i32 [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_END]] ]
-; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-entry:
-  %cmp15 = icmp sgt i32 %N, 0
-  br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %N to i64
-  br label %for.body
-
-for.cond.cleanup.loopexit:                        ; preds = %for.inc
-  %0 = or i32 %s.1, %t.1
-  br label %for.cond.cleanup
-
-for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
-  %or = phi i32 [ %0, %for.cond.cleanup.loopexit ], [ -1, %entry ]
-  ret i32 %or
-
-for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
-  %s.017 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.inc ]
-  %t.016 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
-  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
-  %1 = load i8, ptr %arrayidx, align 1
-  %tobool.not = icmp eq i8 %1, 0
-  br i1 %tobool.not, label %if.end, label %if.then
-
-if.then:                                          ; preds = %for.body
-  %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
-  %2 = load i32, ptr %arrayidx2, align 4
-  br label %if.end
-
-if.end:                                           ; preds = %if.then, %for.body
-  %t.1 = phi i32 [ %2, %if.then ], [ %t.016, %for.body ]
-  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
-  %3 = load i8, ptr %arrayidx4, align 1
-  %tobool5.not = icmp eq i8 %3, 0
-  br i1 %tobool5.not, label %for.inc, label %if.then6
-
-if.then6:                                         ; preds = %if.end
-  %arrayidx8 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
-  %4 = load i32, ptr %arrayidx8, align 4
-  br label %for.inc
-
-for.inc:                                          ; preds = %if.end, %if.then6
-  %s.1 = phi i32 [ %4, %if.then6 ], [ %s.017, %if.end ]
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; float csa_in_series_float(int N, bool *cond0, bool *cond1, float *data0,
-;                           float *data1) {
-;   float t = 1.0f;
-;   float s = 1.0f;
-;   for (int i = 0; i < N; i++) {
-;     if (cond0[i])
-;       t = data0[i];
-;     if (cond1[i])
-;       s = data1[i];
-;   }
-;   return t + s; // use t and s
-; }
-define float @csa_in_series_float(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
-; EVL-LABEL: @csa_in_series_float(
-; EVL-NEXT:  entry:
-; EVL-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; EVL:       for.body.preheader:
-; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       for.cond.cleanup.loopexit:
-; EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_INC:%.*]] ]
-; EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC]] ]
-; EVL-NEXT:    [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
-; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; EVL:       for.cond.cleanup:
-; EVL-NEXT:    [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
-; EVL-NEXT:    ret float [[ADD]]
-; EVL:       for.body:
-; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; EVL-NEXT:    [[S_017:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
-; EVL-NEXT:    [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
-; EVL:       if.then:
-; EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; EVL-NEXT:    br label [[IF_END]]
-; EVL:       if.end:
-; EVL-NEXT:    [[T_1]] = phi float [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; EVL-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
-; EVL-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
-; EVL:       if.then6:
-; EVL-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP4:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
-; EVL-NEXT:    br label [[FOR_INC]]
-; EVL:       for.inc:
-; EVL-NEXT:    [[S_1]] = phi float [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_END]] ]
-; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; NO-EVL-LABEL: @csa_in_series_float(
-; NO-EVL-NEXT:  entry:
-; NO-EVL-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; NO-EVL:       for.body.preheader:
-; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       for.cond.cleanup.loopexit:
-; NO-EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_INC:%.*]] ]
-; NO-EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC]] ]
-; NO-EVL-NEXT:    [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
-; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; NO-EVL:       for.cond.cleanup:
-; NO-EVL-NEXT:    [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
-; NO-EVL-NEXT:    ret float [[ADD]]
-; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; NO-EVL-NEXT:    [[S_017:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
-; NO-EVL-NEXT:    [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; NO-EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; NO-EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
-; NO-EVL:       if.then:
-; NO-EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; NO-EVL-NEXT:    br label [[IF_END]]
-; NO-EVL:       if.end:
-; NO-EVL-NEXT:    [[T_1]] = phi float [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; NO-EVL-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
-; NO-EVL-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
-; NO-EVL:       if.then6:
-; NO-EVL-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP4:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
-; NO-EVL-NEXT:    br label [[FOR_INC]]
-; NO-EVL:       for.inc:
-; NO-EVL-NEXT:    [[S_1]] = phi float [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_END]] ]
-; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; DATA-LABEL: @csa_in_series_float(
-; DATA-NEXT:  entry:
-; DATA-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA:       for.body.preheader:
-; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT:    br label [[FOR_BODY:%.*]]
-; DATA:       for.cond.cleanup.loopexit:
-; DATA-NEXT:    [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_INC:%.*]] ]
-; DATA-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC]] ]
-; DATA-NEXT:    [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
-; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
-; DATA:       for.cond.cleanup:
-; DATA-NEXT:    [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
-; DATA-NEXT:    ret float [[ADD]]
-; DATA:       for.body:
-; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; DATA-NEXT:    [[S_017:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
-; DATA-NEXT:    [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; DATA-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; DATA-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
-; DATA:       if.then:
-; DATA-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; DATA-NEXT:    br label [[IF_END]]
-; DATA:       if.end:
-; DATA-NEXT:    [[T_1]] = phi float [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; DATA-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
-; DATA-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
-; DATA:       if.then6:
-; DATA-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP4:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
-; DATA-NEXT:    br label [[FOR_INC]]
-; DATA:       for.inc:
-; DATA-NEXT:    [[S_1]] = phi float [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_END]] ]
-; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-entry:
-  %cmp15 = icmp sgt i32 %N, 0
-  br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %N to i64
-  br label %for.body
-
-for.cond.cleanup.loopexit:                        ; preds = %for.inc
-  %0 = fadd float %t.1, %s.1
-  br label %for.cond.cleanup
-
-for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
-  %add = phi float [ %0, %for.cond.cleanup.loopexit ], [ 2.000000e+00, %entry ]
-  ret float %add
-
-for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
-  %s.017 = phi float [ 1.000000e+00, %for.body.preheader ], [ %s.1, %for.inc ]
-  %t.016 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
-  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
-  %1 = load i8, ptr %arrayidx, align 1
-  %tobool.not = icmp eq i8 %1, 0
-  br i1 %tobool.not, label %if.end, label %if.then
-
-if.then:                                          ; preds = %for.body
-  %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
-  %2 = load float, ptr %arrayidx2, align 4
-  br label %if.end
-
-if.end:                                           ; preds = %if.then, %for.body
-  %t.1 = phi float [ %2, %if.then ], [ %t.016, %for.body ]
-  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
-  %3 = load i8, ptr %arrayidx4, align 1
-  %tobool5.not = icmp eq i8 %3, 0
-  br i1 %tobool5.not, label %for.inc, label %if.then6
-
-if.then6:                                         ; preds = %if.end
-  %arrayidx8 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
-  %4 = load float, ptr %arrayidx8, align 4
-  br label %for.inc
-
-for.inc:                                          ; preds = %if.end, %if.then6
-  %s.1 = phi float [ %4, %if.then6 ], [ %s.017, %if.end ]
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; int csa_in_series_same_scalar_int_select(int N, int *data0,
-;                                   int *data1) {
-;   int t = -1;
-;   for (int i = 0; i < N; i++) {
-;     if (i < data0[i])
-;       t = data0[i];
-;     if (i < data1[i])
-;       t = data1[i];
-;   }
-;   return t; // use t
-; }
-define i32 @csa_in_series_same_scalar_int_select(i32 %N, ptr %data0, ptr %data1) {
-; EVL-LABEL: @csa_in_series_same_scalar_int_select(
-; EVL-NEXT:  entry:
-; EVL-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; EVL:       for.body.preheader:
-; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       for.cond.cleanup.loopexit:
-; EVL-NEXT:    [[T_2_LCSSA:%.*]] = phi i32 [ [[T_2:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; EVL:       for.cond.cleanup:
-; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; EVL-NEXT:    ret i32 [[T_0_LCSSA]]
-; EVL:       for.body:
-; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT:    [[TMP1:%.*]] = sext i32 [[TMP0]] to i64
-; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP1]]
-; EVL-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[CMP1]], i32 [[TMP0]], i32 [[T_022]]
-; EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; EVL-NEXT:    [[TMP3:%.*]] = sext i32 [[TMP2]] to i64
-; EVL-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP3]]
-; EVL-NEXT:    [[T_2]] = select i1 [[CMP6]], i32 [[TMP2]], i32 [[SPEC_SELECT]]
-; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; NO-EVL-LABEL: @csa_in_series_same_scalar_int_select(
-; NO-EVL-NEXT:  entry:
-; NO-EVL-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; NO-EVL:       for.body.preheader:
-; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       for.cond.cleanup.loopexit:
-; NO-EVL-NEXT:    [[T_2_LCSSA:%.*]] = phi i32 [ [[T_2:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; NO-EVL:       for.cond.cleanup:
-; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; NO-EVL-NEXT:    ret i32 [[T_0_LCSSA]]
-; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; NO-EVL-NEXT:    [[TMP1:%.*]] = sext i32 [[TMP0]] to i64
-; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP1]]
-; NO-EVL-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[CMP1]], i32 [[TMP0]], i32 [[T_022]]
-; NO-EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; NO-EVL-NEXT:    [[TMP3:%.*]] = sext i32 [[TMP2]] to i64
-; NO-EVL-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP3]]
-; NO-EVL-NEXT:    [[T_2]] = select i1 [[CMP6]], i32 [[TMP2]], i32 [[SPEC_SELECT]]
-; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; DATA-LABEL: @csa_in_series_same_scalar_int_select(
-; DATA-NEXT:  entry:
-; DATA-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA:       for.body.preheader:
-; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT:    br label [[FOR_BODY:%.*]]
-; DATA:       for.cond.cleanup.loopexit:
-; DATA-NEXT:    [[T_2_LCSSA:%.*]] = phi i32 [ [[T_2:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
-; DATA:       for.cond.cleanup:
-; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; DATA-NEXT:    ret i32 [[T_0_LCSSA]]
-; DATA:       for.body:
-; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; DATA-NEXT:    [[TMP1:%.*]] = sext i32 [[TMP0]] to i64
-; DATA-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP1]]
-; DATA-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[CMP1]], i32 [[TMP0]], i32 [[T_022]]
-; DATA-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; DATA-NEXT:    [[TMP3:%.*]] = sext i32 [[TMP2]] to i64
-; DATA-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP3]]
-; DATA-NEXT:    [[T_2]] = select i1 [[CMP6]], i32 [[TMP2]], i32 [[SPEC_SELECT]]
-; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-entry:
-  %cmp21 = icmp sgt i32 %N, 0
-  br i1 %cmp21, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %N to i64
-  br label %for.body
-
-for.cond.cleanup:                                 ; preds = %for.body, %entry
-  %t.0.lcssa = phi i32 [ -1, %entry ], [ %t.2, %for.body ]
-  ret i32 %t.0.lcssa
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-  %t.022 = phi i32 [ -1, %for.body.preheader ], [ %t.2, %for.body ]
-  %arrayidx = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
-  %0 = load i32, ptr %arrayidx, align 4
-  %1 = sext i32 %0 to i64
-  %cmp1 = icmp slt i64 %indvars.iv, %1
-  %spec.select = select i1 %cmp1, i32 %0, i32 %t.022
-  %arrayidx5 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
-  %2 = load i32, ptr %arrayidx5, align 4
-  %3 = sext i32 %2 to i64
-  %cmp6 = icmp slt i64 %indvars.iv, %3
-  %t.2 = select i1 %cmp6, i32 %2, i32 %spec.select
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; float csa_in_series_same_scalar_float_select(int N,
-;                                       float *data0, float *data1) {
-;   float t = 1.0f;
-;   for (int i = 0; i < N; i++) {
-;     if (0.0f < data0[i])
-;       t = data0[i];
-;     if (0.0f < data1[i])
-;       t = data1[i];
-;   }
-;   return t; // use t
-; }
-define float @csa_in_series_same_scalar_float_select(i32 %N, ptr %data0, ptr %data1) {
-; EVL-LABEL: @csa_in_series_same_scalar_float_select(
-; EVL-NEXT:  entry:
-; EVL-NEXT:    [[CMP19:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; EVL:       for.body.preheader:
-; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       for.cond.cleanup.loopexit:
-; EVL-NEXT:    [[T_2_LCSSA:%.*]] = phi float [ [[T_2:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; EVL:       for.cond.cleanup:
-; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; EVL-NEXT:    ret float [[T_0_LCSSA]]
-; EVL:       for.body:
-; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[T_020:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 0.000000e+00
-; EVL-NEXT:    [[T_1:%.*]] = select i1 [[CMP1]], float [[TMP0]], float [[T_020]]
-; EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP1:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
-; EVL-NEXT:    [[CMP6:%.*]] = fcmp ogt float [[TMP1]], 0.000000e+00
-; EVL-NEXT:    [[T_2]] = select i1 [[CMP6]], float [[TMP1]], float [[T_1]]
-; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; NO-EVL-LABEL: @csa_in_series_same_scalar_float_select(
-; NO-EVL-NEXT:  entry:
-; NO-EVL-NEXT:    [[CMP19:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; NO-EVL:       for.body.preheader:
-; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       for.cond.cleanup.loopexit:
-; NO-EVL-NEXT:    [[T_2_LCSSA:%.*]] = phi float [ [[T_2:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; NO-EVL:       for.cond.cleanup:
-; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; NO-EVL-NEXT:    ret float [[T_0_LCSSA]]
-; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[T_020:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; NO-EVL-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 0.000000e+00
-; NO-EVL-NEXT:    [[T_1:%.*]] = select i1 [[CMP1]], float [[TMP0]], float [[T_020]]
-; NO-EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP1:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
-; NO-EVL-NEXT:    [[CMP6:%.*]] = fcmp ogt float [[TMP1]], 0.000000e+00
-; NO-EVL-NEXT:    [[T_2]] = select i1 [[CMP6]], float [[TMP1]], float [[T_1]]
-; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; DATA-LABEL: @csa_in_series_same_scalar_float_select(
-; DATA-NEXT:  entry:
-; DATA-NEXT:    [[CMP19:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT:    br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA:       for.body.preheader:
-; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT:    br label [[FOR_BODY:%.*]]
-; DATA:       for.cond.cleanup.loopexit:
-; DATA-NEXT:    [[T_2_LCSSA:%.*]] = phi float [ [[T_2:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
-; DATA:       for.cond.cleanup:
-; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; DATA-NEXT:    ret float [[T_0_LCSSA]]
-; DATA:       for.body:
-; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[T_020:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; DATA-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 0.000000e+00
-; DATA-NEXT:    [[T_1:%.*]] = select i1 [[CMP1]], float [[TMP0]], float [[T_020]]
-; DATA-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP1:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
-; DATA-NEXT:    [[CMP6:%.*]] = fcmp ogt float [[TMP1]], 0.000000e+00
-; DATA-NEXT:    [[T_2]] = select i1 [[CMP6]], float [[TMP1]], float [[T_1]]
-; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-entry:
-  %cmp19 = icmp sgt i32 %N, 0
-  br i1 %cmp19, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %N to i64
-  br label %for.body
-
-for.cond.cleanup:                                 ; preds = %for.body, %entry
-  %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.2, %for.body ]
-  ret float %t.0.lcssa
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-  %t.020 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.2, %for.body ]
-  %arrayidx = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
-  %0 = load float, ptr %arrayidx, align 4
-  %cmp1 = fcmp ogt float %0, 0.000000e+00
-  %t.1 = select i1 %cmp1, float %0, float %t.020
-  %arrayidx5 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
-  %1 = load float, ptr %arrayidx5, align 4
-  %cmp6 = fcmp ogt float %1, 0.000000e+00
-  %t.2 = select i1 %cmp6, float %1, float %t.1
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; int csa_in_series_same_scalar_int(int N, bool *cond0, bool *cond1, int *data0,
-;                                   int *data1) {
-;   int t = -1;
-;   for (int i = 0; i < N; i++) {
-;     if (cond0[i])
-;       t = data0[i];
-;     if (cond1[i])
-;       t = data1[i];
-;   }
-;   return t; // use t
-; }
-define i32 @csa_in_series_same_scalar_int(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
-; EVL-LABEL: @csa_in_series_same_scalar_int(
-; EVL-NEXT:  entry:
-; EVL-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; EVL:       for.body.preheader:
-; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       for.cond.cleanup.loopexit:
-; EVL-NEXT:    [[T_2_LCSSA:%.*]] = phi i32 [ [[T_2:%.*]], [[FOR_INC:%.*]] ]
-; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; EVL:       for.cond.cleanup:
-; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; EVL-NEXT:    ret i32 [[T_0_LCSSA]]
-; EVL:       for.body:
-; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; EVL-NEXT:    [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_INC]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
-; EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
-; EVL:       if.then:
-; EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; EVL-NEXT:    br label [[IF_END]]
-; EVL:       if.end:
-; EVL-NEXT:    [[T_1:%.*]] = phi i32 [ [[TMP1]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; EVL-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP2]], 0
-; EVL-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
-; EVL:       if.then6:
-; EVL-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4
-; EVL-NEXT:    br label [[FOR_INC]]
-; EVL:       for.inc:
-; EVL-NEXT:    [[T_2]] = phi i32 [ [[TMP3]], [[IF_THEN6]] ], [ [[T_1]], [[IF_END]] ]
-; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; NO-EVL-LABEL: @csa_in_series_same_scalar_int(
-; NO-EVL-NEXT:  entry:
-; NO-EVL-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; NO-EVL:       for.body.preheader:
-; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       for.cond.cleanup.loopexit:
-; NO-EVL-NEXT:    [[T_2_LCSSA:%.*]] = phi i32 [ [[T_2:%.*]], [[FOR_INC:%.*]] ]
-; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; NO-EVL:       for.cond.cleanup:
-; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; NO-EVL-NEXT:    ret i32 [[T_0_LCSSA]]
-; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; NO-EVL-NEXT:    [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_INC]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; NO-EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
-; NO-EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
-; NO-EVL:       if.then:
-; NO-EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; NO-EVL-NEXT:    br label [[IF_END]]
-; NO-EVL:       if.end:
-; NO-EVL-NEXT:    [[T_1:%.*]] = phi i32 [ [[TMP1]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; NO-EVL-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP2]], 0
-; NO-EVL-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
-; NO-EVL:       if.then6:
-; NO-EVL-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4
-; NO-EVL-NEXT:    br label [[FOR_INC]]
-; NO-EVL:       for.inc:
-; NO-EVL-NEXT:    [[T_2]] = phi i32 [ [[TMP3]], [[IF_THEN6]] ], [ [[T_1]], [[IF_END]] ]
-; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; DATA-LABEL: @csa_in_series_same_scalar_int(
-; DATA-NEXT:  entry:
-; DATA-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA:       for.body.preheader:
-; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT:    br label [[FOR_BODY:%.*]]
-; DATA:       for.cond.cleanup.loopexit:
-; DATA-NEXT:    [[T_2_LCSSA:%.*]] = phi i32 [ [[T_2:%.*]], [[FOR_INC:%.*]] ]
-; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
-; DATA:       for.cond.cleanup:
-; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; DATA-NEXT:    ret i32 [[T_0_LCSSA]]
-; DATA:       for.body:
-; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; DATA-NEXT:    [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_INC]] ]
-; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; DATA-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
-; DATA-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
-; DATA:       if.then:
-; DATA-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; DATA-NEXT:    br label [[IF_END]]
-; DATA:       if.end:
-; DATA-NEXT:    [[T_1:%.*]] = phi i32 [ [[TMP1]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; DATA-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP2]], 0
-; DATA-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
-; DATA:       if.then6:
-; DATA-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4
-; DATA-NEXT:    br label [[FOR_INC]]
-; DATA:       for.inc:
-; DATA-NEXT:    [[T_2]] = phi i32 [ [[TMP3]], [[IF_THEN6]] ], [ [[T_1]], [[IF_END]] ]
-; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-entry:
-  %cmp15 = icmp sgt i32 %N, 0
-  br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %N to i64
-  br label %for.body
-
-for.cond.cleanup:                                 ; preds = %for.inc, %entry
-  %t.0.lcssa = phi i32 [ -1, %entry ], [ %t.2, %for.inc ]
-  ret i32 %t.0.lcssa
-
-for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
-  %t.016 = phi i32 [ -1, %for.body.preheader ], [ %t.2, %for.inc ]
-  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
-  %0 = load i8, ptr %arrayidx, align 1
-  %tobool.not = icmp eq i8 %0, 0
-  br i1 %tobool.not, label %if.end, label %if.then
-
-if.then:                                          ; preds = %for.body
-  %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
-  %1 = load i32, ptr %arrayidx2, align 4
-  br label %if.end
-
-if.end:                                           ; preds = %if.then, %for.body
-  %t.1 = phi i32 [ %1, %if.then ], [ %t.016, %for.body ]
-  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
-  %2 = load i8, ptr %arrayidx4, align 1
-  %tobool5.not = icmp eq i8 %2, 0
-  br i1 %tobool5.not, label %for.inc, label %if.then6
-
-if.then6:                                         ; preds = %if.end
-  %arrayidx8 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
-  %3 = load i32, ptr %arrayidx8, align 4
-  br label %for.inc
-
-for.inc:                                          ; preds = %if.end, %if.then6
-  %t.2 = phi i32 [ %3, %if.then6 ], [ %t.1, %if.end ]
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; float csa_in_series_same_scalar_float(int N, bool *cond0, bool *cond1,
-;                                       float *data0, float *data1) {
-;   float t = 1.0f;
-;   for (int i = 0; i < N; i++) {
-;     if (cond0[i])
-;       t = data0[i];
-;     if (cond1[i])
-;       t = data1[i];
-;   }
-;   return t; // use t
-; }
-define float @csa_in_series_same_scalar_float(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
-; EVL-LABEL: @csa_in_series_same_scalar_float(
-; EVL-NEXT:  entry:
-; EVL-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; EVL:       for.body.preheader:
-; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       for.cond.cleanup.loopexit:
-; EVL-NEXT:    [[T_2_LCSSA:%.*]] = phi float [ [[T_2:%.*]], [[FOR_INC:%.*]] ]
-; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; EVL:       for.cond.cleanup:
-; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; EVL-NEXT:    ret float [[T_0_LCSSA]]
-; EVL:       for.body:
-; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; EVL-NEXT:    [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_INC]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
-; EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
-; EVL:       if.then:
-; EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; EVL-NEXT:    br label [[IF_END]]
-; EVL:       if.end:
-; EVL-NEXT:    [[T_1:%.*]] = phi float [ [[TMP1]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; EVL-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP2]], 0
-; EVL-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
-; EVL:       if.then6:
-; EVL-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP3:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
-; EVL-NEXT:    br label [[FOR_INC]]
-; EVL:       for.inc:
-; EVL-NEXT:    [[T_2]] = phi float [ [[TMP3]], [[IF_THEN6]] ], [ [[T_1]], [[IF_END]] ]
-; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; NO-EVL-LABEL: @csa_in_series_same_scalar_float(
-; NO-EVL-NEXT:  entry:
-; NO-EVL-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; NO-EVL:       for.body.preheader:
-; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       for.cond.cleanup.loopexit:
-; NO-EVL-NEXT:    [[T_2_LCSSA:%.*]] = phi float [ [[T_2:%.*]], [[FOR_INC:%.*]] ]
-; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; NO-EVL:       for.cond.cleanup:
-; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; NO-EVL-NEXT:    ret float [[T_0_LCSSA]]
-; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; NO-EVL-NEXT:    [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_INC]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; NO-EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
-; NO-EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
-; NO-EVL:       if.then:
-; NO-EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; NO-EVL-NEXT:    br label [[IF_END]]
-; NO-EVL:       if.end:
-; NO-EVL-NEXT:    [[T_1:%.*]] = phi float [ [[TMP1]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; NO-EVL-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP2]], 0
-; NO-EVL-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
-; NO-EVL:       if.then6:
-; NO-EVL-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP3:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
-; NO-EVL-NEXT:    br label [[FOR_INC]]
-; NO-EVL:       for.inc:
-; NO-EVL-NEXT:    [[T_2]] = phi float [ [[TMP3]], [[IF_THEN6]] ], [ [[T_1]], [[IF_END]] ]
-; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; DATA-LABEL: @csa_in_series_same_scalar_float(
-; DATA-NEXT:  entry:
-; DATA-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA:       for.body.preheader:
-; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT:    br label [[FOR_BODY:%.*]]
-; DATA:       for.cond.cleanup.loopexit:
-; DATA-NEXT:    [[T_2_LCSSA:%.*]] = phi float [ [[T_2:%.*]], [[FOR_INC:%.*]] ]
-; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
-; DATA:       for.cond.cleanup:
-; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; DATA-NEXT:    ret float [[T_0_LCSSA]]
-; DATA:       for.body:
-; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; DATA-NEXT:    [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_INC]] ]
-; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; DATA-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
-; DATA-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
-; DATA:       if.then:
-; DATA-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; DATA-NEXT:    br label [[IF_END]]
-; DATA:       if.end:
-; DATA-NEXT:    [[T_1:%.*]] = phi float [ [[TMP1]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; DATA-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP2]], 0
-; DATA-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
-; DATA:       if.then6:
-; DATA-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP3:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
-; DATA-NEXT:    br label [[FOR_INC]]
-; DATA:       for.inc:
-; DATA-NEXT:    [[T_2]] = phi float [ [[TMP3]], [[IF_THEN6]] ], [ [[T_1]], [[IF_END]] ]
-; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-entry:
-  %cmp15 = icmp sgt i32 %N, 0
-  br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %N to i64
-  br label %for.body
-
-for.cond.cleanup:                                 ; preds = %for.inc, %entry
-  %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.2, %for.inc ]
-  ret float %t.0.lcssa
-
-for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
-  %t.016 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.2, %for.inc ]
-  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
-  %0 = load i8, ptr %arrayidx, align 1
-  %tobool.not = icmp eq i8 %0, 0
-  br i1 %tobool.not, label %if.end, label %if.then
-
-if.then:                                          ; preds = %for.body
-  %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
-  %1 = load float, ptr %arrayidx2, align 4
-  br label %if.end
-
-if.end:                                           ; preds = %if.then, %for.body
-  %t.1 = phi float [ %1, %if.then ], [ %t.016, %for.body ]
-  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
-  %2 = load i8, ptr %arrayidx4, align 1
-  %tobool5.not = icmp eq i8 %2, 0
-  br i1 %tobool5.not, label %for.inc, label %if.then6
-
-if.then6:                                         ; preds = %if.end
-  %arrayidx8 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
-  %3 = load float, ptr %arrayidx8, align 4
-  br label %for.inc
-
-for.inc:                                          ; preds = %if.end, %if.then6
-  %t.2 = phi float [ %3, %if.then6 ], [ %t.1, %if.end ]
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; int csa_same_cond_int(int N, bool *cond, int *data0, int *data1) {
-;   int t = -1;
-;   int s = -1;
-;   for (int i = 0; i < N; i++) {
-;     if (cond[i]) {
-;       t = data0[i];
-;       s = data1[i];
-;     }
-;   }
-;   return t | s; // use t and s
-; }
-define i32 @csa_same_cond_int(i32 %N, ptr %cond, ptr %data0, ptr %data1) {
-; EVL-LABEL: @csa_same_cond_int(
-; EVL-NEXT:  entry:
-; EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; EVL:       for.body.preheader:
-; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       for.cond.cleanup.loopexit:
-; EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_INC]] ]
-; EVL-NEXT:    [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[T_1_LCSSA]]
-; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; EVL:       for.cond.cleanup:
-; EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
-; EVL-NEXT:    ret i32 [[OR]]
-; EVL:       for.body:
-; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; EVL-NEXT:    [[S_011:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
-; EVL-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
-; EVL:       if.then:
-; EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4
-; EVL-NEXT:    br label [[FOR_INC]]
-; EVL:       for.inc:
-; EVL-NEXT:    [[T_1]] = phi i32 [ [[TMP2]], [[IF_THEN]] ], [ [[T_010]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[S_1]] = phi i32 [ [[TMP3]], [[IF_THEN]] ], [ [[S_011]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; NO-EVL-LABEL: @csa_same_cond_int(
-; NO-EVL-NEXT:  entry:
-; NO-EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; NO-EVL:       for.body.preheader:
-; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       for.cond.cleanup.loopexit:
-; NO-EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; NO-EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_INC]] ]
-; NO-EVL-NEXT:    [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[T_1_LCSSA]]
-; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; NO-EVL:       for.cond.cleanup:
-; NO-EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
-; NO-EVL-NEXT:    ret i32 [[OR]]
-; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; NO-EVL-NEXT:    [[S_011:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
-; NO-EVL-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; NO-EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; NO-EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
-; NO-EVL:       if.then:
-; NO-EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; NO-EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4
-; NO-EVL-NEXT:    br label [[FOR_INC]]
-; NO-EVL:       for.inc:
-; NO-EVL-NEXT:    [[T_1]] = phi i32 [ [[TMP2]], [[IF_THEN]] ], [ [[T_010]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[S_1]] = phi i32 [ [[TMP3]], [[IF_THEN]] ], [ [[S_011]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; DATA-LABEL: @csa_same_cond_int(
-; DATA-NEXT:  entry:
-; DATA-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA:       for.body.preheader:
-; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT:    br label [[FOR_BODY:%.*]]
-; DATA:       for.cond.cleanup.loopexit:
-; DATA-NEXT:    [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; DATA-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_INC]] ]
-; DATA-NEXT:    [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[T_1_LCSSA]]
-; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
-; DATA:       for.cond.cleanup:
-; DATA-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
-; DATA-NEXT:    ret i32 [[OR]]
-; DATA:       for.body:
-; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; DATA-NEXT:    [[S_011:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
-; DATA-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; DATA-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; DATA-NEXT:    br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
-; DATA:       if.then:
-; DATA-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; DATA-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4
-; DATA-NEXT:    br label [[FOR_INC]]
-; DATA:       for.inc:
-; DATA-NEXT:    [[T_1]] = phi i32 [ [[TMP2]], [[IF_THEN]] ], [ [[T_010]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[S_1]] = phi i32 [ [[TMP3]], [[IF_THEN]] ], [ [[S_011]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-entry:
-  %cmp9 = icmp sgt i32 %N, 0
-  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %N to i64
-  br label %for.body
-
-for.cond.cleanup.loopexit:                        ; preds = %for.inc
-  %0 = or i32 %s.1, %t.1
-  br label %for.cond.cleanup
-
-for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
-  %or = phi i32 [ %0, %for.cond.cleanup.loopexit ], [ -1, %entry ]
-  ret i32 %or
-
-for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
-  %s.011 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.inc ]
-  %t.010 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
-  %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %indvars.iv
-  %1 = load i8, ptr %arrayidx, align 1
-  %tobool.not = icmp eq i8 %1, 0
-  br i1 %tobool.not, label %for.inc, label %if.then
-
-if.then:                                          ; preds = %for.body
-  %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
-  %2 = load i32, ptr %arrayidx2, align 4
-  %arrayidx4 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
-  %3 = load i32, ptr %arrayidx4, align 4
-  br label %for.inc
-
-for.inc:                                          ; preds = %for.body, %if.then
-  %t.1 = phi i32 [ %2, %if.then ], [ %t.010, %for.body ]
-  %s.1 = phi i32 [ %3, %if.then ], [ %s.011, %for.body ]
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; float csa_same_cond_float(int N, bool *cond, float *data0, float *data1) {
-;   float t = 1.0f;
-;   float s = 1.0f;
-;   for (int i = 0; i < N; i++) {
-;     if (cond[i]) {
-;       t = data0[i];
-;       s = data1[i];
-;     }
-;   }
-;   return t + s; // use t and s
-; }
-define float @csa_same_cond_float(i32 %N, ptr %cond, ptr %data0, ptr %data1) {
-; EVL-LABEL: @csa_same_cond_float(
-; EVL-NEXT:  entry:
-; EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; EVL:       for.body.preheader:
-; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       for.cond.cleanup.loopexit:
-; EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_INC]] ]
-; EVL-NEXT:    [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
-; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; EVL:       for.cond.cleanup:
-; EVL-NEXT:    [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
-; EVL-NEXT:    ret float [[ADD]]
-; EVL:       for.body:
-; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; EVL-NEXT:    [[S_011:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
-; EVL-NEXT:    [[T_010:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
-; EVL:       if.then:
-; EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP3:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
-; EVL-NEXT:    br label [[FOR_INC]]
-; EVL:       for.inc:
-; EVL-NEXT:    [[T_1]] = phi float [ [[TMP2]], [[IF_THEN]] ], [ [[T_010]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[S_1]] = phi float [ [[TMP3]], [[IF_THEN]] ], [ [[S_011]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; NO-EVL-LABEL: @csa_same_cond_float(
-; NO-EVL-NEXT:  entry:
-; NO-EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; NO-EVL:       for.body.preheader:
-; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       for.cond.cleanup.loopexit:
-; NO-EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; NO-EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_INC]] ]
-; NO-EVL-NEXT:    [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
-; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; NO-EVL:       for.cond.cleanup:
-; NO-EVL-NEXT:    [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
-; NO-EVL-NEXT:    ret float [[ADD]]
-; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; NO-EVL-NEXT:    [[S_011:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
-; NO-EVL-NEXT:    [[T_010:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; NO-EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; NO-EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
-; NO-EVL:       if.then:
-; NO-EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; NO-EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP3:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
-; NO-EVL-NEXT:    br label [[FOR_INC]]
-; NO-EVL:       for.inc:
-; NO-EVL-NEXT:    [[T_1]] = phi float [ [[TMP2]], [[IF_THEN]] ], [ [[T_010]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[S_1]] = phi float [ [[TMP3]], [[IF_THEN]] ], [ [[S_011]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; DATA-LABEL: @csa_same_cond_float(
-; DATA-NEXT:  entry:
-; DATA-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA:       for.body.preheader:
-; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT:    br label [[FOR_BODY:%.*]]
-; DATA:       for.cond.cleanup.loopexit:
-; DATA-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; DATA-NEXT:    [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_INC]] ]
-; DATA-NEXT:    [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
-; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
-; DATA:       for.cond.cleanup:
-; DATA-NEXT:    [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
-; DATA-NEXT:    ret float [[ADD]]
-; DATA:       for.body:
-; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; DATA-NEXT:    [[S_011:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
-; DATA-NEXT:    [[T_010:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; DATA-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; DATA-NEXT:    br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
-; DATA:       if.then:
-; DATA-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; DATA-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP3:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
-; DATA-NEXT:    br label [[FOR_INC]]
-; DATA:       for.inc:
-; DATA-NEXT:    [[T_1]] = phi float [ [[TMP2]], [[IF_THEN]] ], [ [[T_010]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[S_1]] = phi float [ [[TMP3]], [[IF_THEN]] ], [ [[S_011]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-entry:
-  %cmp9 = icmp sgt i32 %N, 0
-  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %N to i64
-  br label %for.body
-
-for.cond.cleanup.loopexit:                        ; preds = %for.inc
-  %0 = fadd float %t.1, %s.1
-  br label %for.cond.cleanup
-
-for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
-  %add = phi float [ %0, %for.cond.cleanup.loopexit ], [ 2.000000e+00, %entry ]
-  ret float %add
-
-for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
-  %s.011 = phi float [ 1.000000e+00, %for.body.preheader ], [ %s.1, %for.inc ]
-  %t.010 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
-  %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %indvars.iv
-  %1 = load i8, ptr %arrayidx, align 1
-  %tobool.not = icmp eq i8 %1, 0
-  br i1 %tobool.not, label %for.inc, label %if.then
-
-if.then:                                          ; preds = %for.body
-  %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
-  %2 = load float, ptr %arrayidx2, align 4
-  %arrayidx4 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
-  %3 = load float, ptr %arrayidx4, align 4
-  br label %for.inc
-
-for.inc:                                          ; preds = %for.body, %if.then
-  %t.1 = phi float [ %2, %if.then ], [ %t.010, %for.body ]
-  %s.1 = phi float [ %3, %if.then ], [ %s.011, %for.body ]
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; int csa_else_if_same_scalar_int(int N, bool *cond0, bool *cond1, int *data0,
-;                                 int *data1) {
-;   int t = -1;
-;   for (int i = 0; i < N; i++) {
-;     if (cond0[i])
-;       t = data0[i];
-;     else if (cond1[i])
-;       t = data1[i];
-;   }
-;   return t; // use t
-; }
-define i32 @csa_else_if_same_scalar_int(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
-; EVL-LABEL: @csa_else_if_same_scalar_int(
-; EVL-NEXT:  entry:
-; EVL-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; EVL:       for.body.preheader:
-; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       for.cond.cleanup.loopexit:
-; EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; EVL:       for.cond.cleanup:
-; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; EVL-NEXT:    ret i32 [[T_0_LCSSA]]
-; EVL:       for.body:
-; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; EVL-NEXT:    [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
-; EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[FOR_INC_SINK_SPLIT:%.*]]
-; EVL:       if.else:
-; EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; EVL-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; EVL-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[FOR_INC_SINK_SPLIT]]
-; EVL:       for.inc.sink.split:
-; EVL-NEXT:    [[DATA0_SINK:%.*]] = phi ptr [ [[DATA0:%.*]], [[FOR_BODY]] ], [ [[DATA1:%.*]], [[IF_ELSE]] ]
-; EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0_SINK]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; EVL-NEXT:    br label [[FOR_INC]]
-; EVL:       for.inc:
-; EVL-NEXT:    [[T_1]] = phi i32 [ [[T_016]], [[IF_ELSE]] ], [ [[TMP2]], [[FOR_INC_SINK_SPLIT]] ]
-; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; NO-EVL-LABEL: @csa_else_if_same_scalar_int(
-; NO-EVL-NEXT:  entry:
-; NO-EVL-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; NO-EVL:       for.body.preheader:
-; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       for.cond.cleanup.loopexit:
-; NO-EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; NO-EVL:       for.cond.cleanup:
-; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; NO-EVL-NEXT:    ret i32 [[T_0_LCSSA]]
-; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; NO-EVL-NEXT:    [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; NO-EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
-; NO-EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[FOR_INC_SINK_SPLIT:%.*]]
-; NO-EVL:       if.else:
-; NO-EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; NO-EVL-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; NO-EVL-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[FOR_INC_SINK_SPLIT]]
-; NO-EVL:       for.inc.sink.split:
-; NO-EVL-NEXT:    [[DATA0_SINK:%.*]] = phi ptr [ [[DATA0:%.*]], [[FOR_BODY]] ], [ [[DATA1:%.*]], [[IF_ELSE]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0_SINK]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; NO-EVL-NEXT:    br label [[FOR_INC]]
-; NO-EVL:       for.inc:
-; NO-EVL-NEXT:    [[T_1]] = phi i32 [ [[T_016]], [[IF_ELSE]] ], [ [[TMP2]], [[FOR_INC_SINK_SPLIT]] ]
-; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; DATA-LABEL: @csa_else_if_same_scalar_int(
-; DATA-NEXT:  entry:
-; DATA-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA:       for.body.preheader:
-; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT:    br label [[FOR_BODY:%.*]]
-; DATA:       for.cond.cleanup.loopexit:
-; DATA-NEXT:    [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
-; DATA:       for.cond.cleanup:
-; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; DATA-NEXT:    ret i32 [[T_0_LCSSA]]
-; DATA:       for.body:
-; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; DATA-NEXT:    [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; DATA-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
-; DATA-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[FOR_INC_SINK_SPLIT:%.*]]
-; DATA:       if.else:
-; DATA-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; DATA-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; DATA-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[FOR_INC_SINK_SPLIT]]
-; DATA:       for.inc.sink.split:
-; DATA-NEXT:    [[DATA0_SINK:%.*]] = phi ptr [ [[DATA0:%.*]], [[FOR_BODY]] ], [ [[DATA1:%.*]], [[IF_ELSE]] ]
-; DATA-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0_SINK]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; DATA-NEXT:    br label [[FOR_INC]]
-; DATA:       for.inc:
-; DATA-NEXT:    [[T_1]] = phi i32 [ [[T_016]], [[IF_ELSE]] ], [ [[TMP2]], [[FOR_INC_SINK_SPLIT]] ]
-; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-entry:
-  %cmp15 = icmp sgt i32 %N, 0
-  br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %N to i64
-  br label %for.body
-
-for.cond.cleanup:                                 ; preds = %for.inc, %entry
-  %t.0.lcssa = phi i32 [ -1, %entry ], [ %t.1, %for.inc ]
-  ret i32 %t.0.lcssa
-
-for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
-  %t.016 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
-  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
-  %0 = load i8, ptr %arrayidx, align 1
-  %tobool.not = icmp eq i8 %0, 0
-  br i1 %tobool.not, label %if.else, label %for.inc.sink.split
-
-if.else:                                          ; preds = %for.body
-  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
-  %1 = load i8, ptr %arrayidx4, align 1
-  %tobool5.not = icmp eq i8 %1, 0
-  br i1 %tobool5.not, label %for.inc, label %for.inc.sink.split
-
-for.inc.sink.split:                               ; preds = %if.else, %for.body
-  %data0.sink = phi ptr [ %data0, %for.body ], [ %data1, %if.else ]
-  %arrayidx2 = getelementptr inbounds i32, ptr %data0.sink, i64 %indvars.iv
-  %2 = load i32, ptr %arrayidx2, align 4
-  br label %for.inc
-
-for.inc:                                          ; preds = %for.inc.sink.split, %if.else
-  %t.1 = phi i32 [ %t.016, %if.else ], [ %2, %for.inc.sink.split ]
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; float csa_else_if_same_scalar_float(int N, bool *cond0, bool *cond1,
-;                                     float *data0, float *data1) {
-;   float t = 1.0f;
-;   for (int i = 0; i < N; i++) {
-;     if (cond0[i])
-;       t = data0[i];
-;     else if (cond1[i])
-;       t = data1[i];
-;   }
-;   return t; // use t
-; }
-define float @csa_else_if_same_scalar_float(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
-; EVL-LABEL: @csa_else_if_same_scalar_float(
-; EVL-NEXT:  entry:
-; EVL-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; EVL:       for.body.preheader:
-; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       for.cond.cleanup.loopexit:
-; EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; EVL:       for.cond.cleanup:
-; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; EVL-NEXT:    ret float [[T_0_LCSSA]]
-; EVL:       for.body:
-; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; EVL-NEXT:    [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
-; EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[FOR_INC_SINK_SPLIT:%.*]]
-; EVL:       if.else:
-; EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; EVL-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; EVL-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[FOR_INC_SINK_SPLIT]]
-; EVL:       for.inc.sink.split:
-; EVL-NEXT:    [[DATA0_SINK:%.*]] = phi ptr [ [[DATA0:%.*]], [[FOR_BODY]] ], [ [[DATA1:%.*]], [[IF_ELSE]] ]
-; EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0_SINK]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; EVL-NEXT:    br label [[FOR_INC]]
-; EVL:       for.inc:
-; EVL-NEXT:    [[T_1]] = phi float [ [[T_016]], [[IF_ELSE]] ], [ [[TMP2]], [[FOR_INC_SINK_SPLIT]] ]
-; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; NO-EVL-LABEL: @csa_else_if_same_scalar_float(
-; NO-EVL-NEXT:  entry:
-; NO-EVL-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; NO-EVL:       for.body.preheader:
-; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       for.cond.cleanup.loopexit:
-; NO-EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; NO-EVL:       for.cond.cleanup:
-; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; NO-EVL-NEXT:    ret float [[T_0_LCSSA]]
-; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; NO-EVL-NEXT:    [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; NO-EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
-; NO-EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[FOR_INC_SINK_SPLIT:%.*]]
-; NO-EVL:       if.else:
-; NO-EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; NO-EVL-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; NO-EVL-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[FOR_INC_SINK_SPLIT]]
-; NO-EVL:       for.inc.sink.split:
-; NO-EVL-NEXT:    [[DATA0_SINK:%.*]] = phi ptr [ [[DATA0:%.*]], [[FOR_BODY]] ], [ [[DATA1:%.*]], [[IF_ELSE]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0_SINK]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; NO-EVL-NEXT:    br label [[FOR_INC]]
-; NO-EVL:       for.inc:
-; NO-EVL-NEXT:    [[T_1]] = phi float [ [[T_016]], [[IF_ELSE]] ], [ [[TMP2]], [[FOR_INC_SINK_SPLIT]] ]
-; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; DATA-LABEL: @csa_else_if_same_scalar_float(
-; DATA-NEXT:  entry:
-; DATA-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA:       for.body.preheader:
-; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT:    br label [[FOR_BODY:%.*]]
-; DATA:       for.cond.cleanup.loopexit:
-; DATA-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
-; DATA:       for.cond.cleanup:
-; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; DATA-NEXT:    ret float [[T_0_LCSSA]]
-; DATA:       for.body:
-; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; DATA-NEXT:    [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; DATA-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
-; DATA-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[FOR_INC_SINK_SPLIT:%.*]]
-; DATA:       if.else:
-; DATA-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; DATA-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; DATA-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[FOR_INC_SINK_SPLIT]]
-; DATA:       for.inc.sink.split:
-; DATA-NEXT:    [[DATA0_SINK:%.*]] = phi ptr [ [[DATA0:%.*]], [[FOR_BODY]] ], [ [[DATA1:%.*]], [[IF_ELSE]] ]
-; DATA-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0_SINK]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; DATA-NEXT:    br label [[FOR_INC]]
-; DATA:       for.inc:
-; DATA-NEXT:    [[T_1]] = phi float [ [[T_016]], [[IF_ELSE]] ], [ [[TMP2]], [[FOR_INC_SINK_SPLIT]] ]
-; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-entry:
-  %cmp15 = icmp sgt i32 %N, 0
-  br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %N to i64
-  br label %for.body
-
-for.cond.cleanup:                                 ; preds = %for.inc, %entry
-  %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.1, %for.inc ]
-  ret float %t.0.lcssa
-
-for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
-  %t.016 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
-  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
-  %0 = load i8, ptr %arrayidx, align 1
-  %tobool.not = icmp eq i8 %0, 0
-  br i1 %tobool.not, label %if.else, label %for.inc.sink.split
-
-if.else:                                          ; preds = %for.body
-  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
-  %1 = load i8, ptr %arrayidx4, align 1
-  %tobool5.not = icmp eq i8 %1, 0
-  br i1 %tobool5.not, label %for.inc, label %for.inc.sink.split
-
-for.inc.sink.split:                               ; preds = %if.else, %for.body
-  %data0.sink = phi ptr [ %data0, %for.body ], [ %data1, %if.else ]
-  %arrayidx2 = getelementptr inbounds float, ptr %data0.sink, i64 %indvars.iv
-  %2 = load float, ptr %arrayidx2, align 4
-  br label %for.inc
-
-for.inc:                                          ; preds = %for.inc.sink.split, %if.else
-  %t.1 = phi float [ %t.016, %if.else ], [ %2, %for.inc.sink.split ]
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; int csa_else_if_int(int N, bool *cond0, bool *cond1, int *data0, int *data1) {
-;   int t = -1;
-;   int s = -1;
-;   for (int i = 0; i < N; i++) {
-;     if (cond0[i])
-;       t = data0[i];
-;     else if (cond1[i])
-;       s = data1[i];
-;   }
-;   return t | s; // use t and s
-; }
-define i32 @csa_else_if_int(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
-; EVL-LABEL: @csa_else_if_int(
-; EVL-NEXT:  entry:
-; EVL-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; EVL:       for.body.preheader:
-; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       for.cond.cleanup.loopexit:
-; EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_INC]] ]
-; EVL-NEXT:    [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[T_1_LCSSA]]
-; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; EVL:       for.cond.cleanup:
-; EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
-; EVL-NEXT:    ret i32 [[OR]]
-; EVL:       for.body:
-; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; EVL-NEXT:    [[S_017:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
-; EVL-NEXT:    [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
-; EVL:       if.then:
-; EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; EVL-NEXT:    br label [[FOR_INC]]
-; EVL:       if.else:
-; EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; EVL-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
-; EVL-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
-; EVL:       if.then6:
-; EVL-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4
-; EVL-NEXT:    br label [[FOR_INC]]
-; EVL:       for.inc:
-; EVL-NEXT:    [[T_1]] = phi i32 [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[IF_THEN6]] ], [ [[T_016]], [[IF_ELSE]] ]
-; EVL-NEXT:    [[S_1]] = phi i32 [ [[S_017]], [[IF_THEN]] ], [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_ELSE]] ]
-; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; NO-EVL-LABEL: @csa_else_if_int(
-; NO-EVL-NEXT:  entry:
-; NO-EVL-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; NO-EVL:       for.body.preheader:
-; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       for.cond.cleanup.loopexit:
-; NO-EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; NO-EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_INC]] ]
-; NO-EVL-NEXT:    [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[T_1_LCSSA]]
-; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; NO-EVL:       for.cond.cleanup:
-; NO-EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
-; NO-EVL-NEXT:    ret i32 [[OR]]
-; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; NO-EVL-NEXT:    [[S_017:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
-; NO-EVL-NEXT:    [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; NO-EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; NO-EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
-; NO-EVL:       if.then:
-; NO-EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; NO-EVL-NEXT:    br label [[FOR_INC]]
-; NO-EVL:       if.else:
-; NO-EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; NO-EVL-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
-; NO-EVL-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
-; NO-EVL:       if.then6:
-; NO-EVL-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4
-; NO-EVL-NEXT:    br label [[FOR_INC]]
-; NO-EVL:       for.inc:
-; NO-EVL-NEXT:    [[T_1]] = phi i32 [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[IF_THEN6]] ], [ [[T_016]], [[IF_ELSE]] ]
-; NO-EVL-NEXT:    [[S_1]] = phi i32 [ [[S_017]], [[IF_THEN]] ], [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_ELSE]] ]
-; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; DATA-LABEL: @csa_else_if_int(
-; DATA-NEXT:  entry:
-; DATA-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA:       for.body.preheader:
-; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT:    br label [[FOR_BODY:%.*]]
-; DATA:       for.cond.cleanup.loopexit:
-; DATA-NEXT:    [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; DATA-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_INC]] ]
-; DATA-NEXT:    [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[T_1_LCSSA]]
-; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
-; DATA:       for.cond.cleanup:
-; DATA-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
-; DATA-NEXT:    ret i32 [[OR]]
-; DATA:       for.body:
-; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; DATA-NEXT:    [[S_017:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
-; DATA-NEXT:    [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; DATA-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; DATA-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
-; DATA:       if.then:
-; DATA-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; DATA-NEXT:    br label [[FOR_INC]]
-; DATA:       if.else:
-; DATA-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; DATA-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
-; DATA-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
-; DATA:       if.then6:
-; DATA-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4
-; DATA-NEXT:    br label [[FOR_INC]]
-; DATA:       for.inc:
-; DATA-NEXT:    [[T_1]] = phi i32 [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[IF_THEN6]] ], [ [[T_016]], [[IF_ELSE]] ]
-; DATA-NEXT:    [[S_1]] = phi i32 [ [[S_017]], [[IF_THEN]] ], [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_ELSE]] ]
-; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-entry:
-  %cmp15 = icmp sgt i32 %N, 0
-  br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %N to i64
-  br label %for.body
-
-for.cond.cleanup.loopexit:                        ; preds = %for.inc
-  %0 = or i32 %s.1, %t.1
-  br label %for.cond.cleanup
-
-for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
-  %or = phi i32 [ %0, %for.cond.cleanup.loopexit ], [ -1, %entry ]
-  ret i32 %or
-
-for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
-  %s.017 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.inc ]
-  %t.016 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
-  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
-  %1 = load i8, ptr %arrayidx, align 1
-  %tobool.not = icmp eq i8 %1, 0
-  br i1 %tobool.not, label %if.else, label %if.then
-
-if.then:                                          ; preds = %for.body
-  %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
-  %2 = load i32, ptr %arrayidx2, align 4
-  br label %for.inc
-
-if.else:                                          ; preds = %for.body
-  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
-  %3 = load i8, ptr %arrayidx4, align 1
-  %tobool5.not = icmp eq i8 %3, 0
-  br i1 %tobool5.not, label %for.inc, label %if.then6
-
-if.then6:                                         ; preds = %if.else
-  %arrayidx8 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
-  %4 = load i32, ptr %arrayidx8, align 4
-  br label %for.inc
-
-for.inc:                                          ; preds = %if.then, %if.then6, %if.else
-  %t.1 = phi i32 [ %2, %if.then ], [ %t.016, %if.then6 ], [ %t.016, %if.else ]
-  %s.1 = phi i32 [ %s.017, %if.then ], [ %4, %if.then6 ], [ %s.017, %if.else ]
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; float csa_else_if_float(int N, bool *cond0, bool *cond1, float *data0,
-;                         float *data1) {
-;   float t = 1.0f;
-;   float s = 1.0f;
-;   for (int i = 0; i < N; i++) {
-;     if (cond0[i])
-;       t = data0[i];
-;     else if (cond1[i])
-;       s = data1[i];
-;   }
-;   return t + s; // use t and s
-; }
-define float @csa_else_if_float(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
-; EVL-LABEL: @csa_else_if_float(
-; EVL-NEXT:  entry:
-; EVL-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; EVL:       for.body.preheader:
-; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       for.cond.cleanup.loopexit:
-; EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_INC]] ]
-; EVL-NEXT:    [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
-; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; EVL:       for.cond.cleanup:
-; EVL-NEXT:    [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
-; EVL-NEXT:    ret float [[ADD]]
-; EVL:       for.body:
-; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; EVL-NEXT:    [[S_017:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
-; EVL-NEXT:    [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
-; EVL:       if.then:
-; EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; EVL-NEXT:    br label [[FOR_INC]]
-; EVL:       if.else:
-; EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; EVL-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
-; EVL-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
-; EVL:       if.then6:
-; EVL-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP4:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
-; EVL-NEXT:    br label [[FOR_INC]]
-; EVL:       for.inc:
-; EVL-NEXT:    [[T_1]] = phi float [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[IF_THEN6]] ], [ [[T_016]], [[IF_ELSE]] ]
-; EVL-NEXT:    [[S_1]] = phi float [ [[S_017]], [[IF_THEN]] ], [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_ELSE]] ]
-; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; NO-EVL-LABEL: @csa_else_if_float(
-; NO-EVL-NEXT:  entry:
-; NO-EVL-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; NO-EVL:       for.body.preheader:
-; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       for.cond.cleanup.loopexit:
-; NO-EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; NO-EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_INC]] ]
-; NO-EVL-NEXT:    [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
-; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; NO-EVL:       for.cond.cleanup:
-; NO-EVL-NEXT:    [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
-; NO-EVL-NEXT:    ret float [[ADD]]
-; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; NO-EVL-NEXT:    [[S_017:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
-; NO-EVL-NEXT:    [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; NO-EVL-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; NO-EVL-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
-; NO-EVL:       if.then:
-; NO-EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; NO-EVL-NEXT:    br label [[FOR_INC]]
-; NO-EVL:       if.else:
-; NO-EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; NO-EVL-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
-; NO-EVL-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
-; NO-EVL:       if.then6:
-; NO-EVL-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP4:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
-; NO-EVL-NEXT:    br label [[FOR_INC]]
-; NO-EVL:       for.inc:
-; NO-EVL-NEXT:    [[T_1]] = phi float [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[IF_THEN6]] ], [ [[T_016]], [[IF_ELSE]] ]
-; NO-EVL-NEXT:    [[S_1]] = phi float [ [[S_017]], [[IF_THEN]] ], [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_ELSE]] ]
-; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; DATA-LABEL: @csa_else_if_float(
-; DATA-NEXT:  entry:
-; DATA-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT:    br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA:       for.body.preheader:
-; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT:    br label [[FOR_BODY:%.*]]
-; DATA:       for.cond.cleanup.loopexit:
-; DATA-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; DATA-NEXT:    [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_INC]] ]
-; DATA-NEXT:    [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
-; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
-; DATA:       for.cond.cleanup:
-; DATA-NEXT:    [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
-; DATA-NEXT:    ret float [[ADD]]
-; DATA:       for.body:
-; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; DATA-NEXT:    [[S_017:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
-; DATA-NEXT:    [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; DATA-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; DATA-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
-; DATA:       if.then:
-; DATA-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; DATA-NEXT:    br label [[FOR_INC]]
-; DATA:       if.else:
-; DATA-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; DATA-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
-; DATA-NEXT:    br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
-; DATA:       if.then6:
-; DATA-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP4:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
-; DATA-NEXT:    br label [[FOR_INC]]
-; DATA:       for.inc:
-; DATA-NEXT:    [[T_1]] = phi float [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[IF_THEN6]] ], [ [[T_016]], [[IF_ELSE]] ]
-; DATA-NEXT:    [[S_1]] = phi float [ [[S_017]], [[IF_THEN]] ], [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_ELSE]] ]
-; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-entry:
-  %cmp15 = icmp sgt i32 %N, 0
-  br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %N to i64
-  br label %for.body
-
-for.cond.cleanup.loopexit:                        ; preds = %for.inc
-  %0 = fadd float %t.1, %s.1
-  br label %for.cond.cleanup
-
-for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
-  %add = phi float [ %0, %for.cond.cleanup.loopexit ], [ 2.000000e+00, %entry ]
-  ret float %add
-
-for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
-  %s.017 = phi float [ 1.000000e+00, %for.body.preheader ], [ %s.1, %for.inc ]
-  %t.016 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
-  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
-  %1 = load i8, ptr %arrayidx, align 1
-  %tobool.not = icmp eq i8 %1, 0
-  br i1 %tobool.not, label %if.else, label %if.then
-
-if.then:                                          ; preds = %for.body
-  %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
-  %2 = load float, ptr %arrayidx2, align 4
-  br label %for.inc
-
-if.else:                                          ; preds = %for.body
-  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
-  %3 = load i8, ptr %arrayidx4, align 1
-  %tobool5.not = icmp eq i8 %3, 0
-  br i1 %tobool5.not, label %for.inc, label %if.then6
-
-if.then6:                                         ; preds = %if.else
-  %arrayidx8 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
-  %4 = load float, ptr %arrayidx8, align 4
-  br label %for.inc
-
-for.inc:                                          ; preds = %if.then, %if.then6, %if.else
-  %t.1 = phi float [ %2, %if.then ], [ %t.016, %if.then6 ], [ %t.016, %if.else ]
-  %s.1 = phi float [ %s.017, %if.then ], [ %4, %if.then6 ], [ %s.017, %if.else ]
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; uint64_t idx_scalar(int64_t *a, int64_t *b, uint64_t ii, uint64_t n) {
-;   uint64_t idx = ii;
-;   for (uint64_t i = 0; i < n; ++i)
-;     idx = (a[i] > b[i]) ? i : idx;
-;   return idx;
-; }
-define i64 @idx_scalar(ptr %a, ptr %b, i64 %ii, i64 %n) {
-; EVL-LABEL: @idx_scalar(
-; EVL-NEXT:  entry:
-; EVL-NEXT:    [[CMP8_NOT:%.*]] = icmp eq i64 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP8_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
-; EVL:       for.body.preheader:
-; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
-; EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
-; EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; EVL:       vector.ph:
-; EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
-; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
-; EVL-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
-; EVL-NEXT:    [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
-; EVL-NEXT:    [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
-; EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
-; EVL-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP10]]
-; EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
-; EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; EVL:       vector.body:
-; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP12]]
-; EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i32 0
-; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP14]], align 8
-; EVL-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP12]]
-; EVL-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[TMP15]], i32 0
-; EVL-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i64>, ptr [[TMP16]], align 8
-; EVL-NEXT:    [[TMP17:%.*]] = icmp sgt <vscale x 2 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; EVL-NEXT:    [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP17]])
-; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP18]], <vscale x 2 x i1> [[TMP17]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
-; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP18]], <vscale x 2 x i64> [[VEC_IND]], <vscale x 2 x i64> [[CSA_DATA_PHI]]
-; EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; EVL-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; EVL:       middle.block:
-; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; EVL-NEXT:    [[TMP20:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
-; EVL-NEXT:    [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP20]])
-; EVL-NEXT:    [[TMP22:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
-; EVL-NEXT:    [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
-; EVL-NEXT:    [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
-; EVL-NEXT:    [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
-; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x i64> [[CSA_DATA_SEL]], i32 [[TMP25]]
-; EVL-NEXT:    [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
-; EVL-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
-; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; EVL:       scalar.ph:
-; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       for.cond.cleanup.loopexit:
-; EVL-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; EVL:       for.cond.cleanup:
-; EVL-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; EVL-NEXT:    ret i64 [[IDX_0_LCSSA]]
-; EVL:       for.body:
-; EVL-NEXT:    [[I_010:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; EVL-NEXT:    [[IDX_09:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[I_010]]
-; EVL-NEXT:    [[TMP28:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; EVL-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[I_010]]
-; EVL-NEXT:    [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
-; EVL-NEXT:    [[CMP2:%.*]] = icmp sgt i64 [[TMP28]], [[TMP29]]
-; EVL-NEXT:    [[COND]] = select i1 [[CMP2]], i64 [[I_010]], i64 [[IDX_09]]
-; EVL-NEXT:    [[INC]] = add nuw i64 [[I_010]], 1
-; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-;
-; NO-EVL-LABEL: @idx_scalar(
-; NO-EVL-NEXT:  entry:
-; NO-EVL-NEXT:    [[CMP8_NOT:%.*]] = icmp eq i64 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP8_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
-; NO-EVL:       for.body.preheader:
-; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
-; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
-; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; NO-EVL:       vector.ph:
-; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
-; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; NO-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
-; NO-EVL-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
-; NO-EVL-NEXT:    [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
-; NO-EVL-NEXT:    [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; NO-EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
-; NO-EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
-; NO-EVL-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP10]]
-; NO-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
-; NO-EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; NO-EVL:       vector.body:
-; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; NO-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP12]]
-; NO-EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i32 0
-; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP14]], align 8
-; NO-EVL-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP12]]
-; NO-EVL-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[TMP15]], i32 0
-; NO-EVL-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i64>, ptr [[TMP16]], align 8
-; NO-EVL-NEXT:    [[TMP17:%.*]] = icmp sgt <vscale x 2 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; NO-EVL-NEXT:    [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP17]])
-; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP18]], <vscale x 2 x i1> [[TMP17]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
-; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP18]], <vscale x 2 x i64> [[VEC_IND]], <vscale x 2 x i64> [[CSA_DATA_PHI]]
-; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; NO-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; NO-EVL-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; NO-EVL:       middle.block:
-; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; NO-EVL-NEXT:    [[TMP20:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
-; NO-EVL-NEXT:    [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP20]])
-; NO-EVL-NEXT:    [[TMP22:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
-; NO-EVL-NEXT:    [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
-; NO-EVL-NEXT:    [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
-; NO-EVL-NEXT:    [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
-; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x i64> [[CSA_DATA_SEL]], i32 [[TMP25]]
-; NO-EVL-NEXT:    [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
-; NO-EVL-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
-; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; NO-EVL:       scalar.ph:
-; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       for.cond.cleanup.loopexit:
-; NO-EVL-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; NO-EVL:       for.cond.cleanup:
-; NO-EVL-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; NO-EVL-NEXT:    ret i64 [[IDX_0_LCSSA]]
-; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[I_010:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; NO-EVL-NEXT:    [[IDX_09:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[I_010]]
-; NO-EVL-NEXT:    [[TMP28:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; NO-EVL-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[I_010]]
-; NO-EVL-NEXT:    [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
-; NO-EVL-NEXT:    [[CMP2:%.*]] = icmp sgt i64 [[TMP28]], [[TMP29]]
-; NO-EVL-NEXT:    [[COND]] = select i1 [[CMP2]], i64 [[I_010]], i64 [[IDX_09]]
-; NO-EVL-NEXT:    [[INC]] = add nuw i64 [[I_010]], 1
-; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-;
-; DATA-LABEL: @idx_scalar(
-; DATA-NEXT:  entry:
-; DATA-NEXT:    [[CMP8_NOT:%.*]] = icmp eq i64 [[N:%.*]], 0
-; DATA-NEXT:    br i1 [[CMP8_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
-; DATA:       for.body.preheader:
-; DATA-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
-; DATA-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
-; DATA-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DATA:       vector.ph:
-; DATA-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
-; DATA-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; DATA-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; DATA-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
-; DATA-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
-; DATA-NEXT:    [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
-; DATA-NEXT:    [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; DATA-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
-; DATA-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
-; DATA-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP10]]
-; DATA-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
-; DATA-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; DATA-NEXT:    br label [[VECTOR_BODY:%.*]]
-; DATA:       vector.body:
-; DATA-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; DATA-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP12]]
-; DATA-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i32 0
-; DATA-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP14]], align 8
-; DATA-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP12]]
-; DATA-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[TMP15]], i32 0
-; DATA-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i64>, ptr [[TMP16]], align 8
-; DATA-NEXT:    [[TMP17:%.*]] = icmp sgt <vscale x 2 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; DATA-NEXT:    [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP17]])
-; DATA-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP18]], <vscale x 2 x i1> [[TMP17]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
-; DATA-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP18]], <vscale x 2 x i64> [[VEC_IND]], <vscale x 2 x i64> [[CSA_DATA_PHI]]
-; DATA-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; DATA-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; DATA-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DATA-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; DATA:       middle.block:
-; DATA-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; DATA-NEXT:    [[TMP20:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
-; DATA-NEXT:    [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP20]])
-; DATA-NEXT:    [[TMP22:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
-; DATA-NEXT:    [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
-; DATA-NEXT:    [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
-; DATA-NEXT:    [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
-; DATA-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x i64> [[CSA_DATA_SEL]], i32 [[TMP25]]
-; DATA-NEXT:    [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
-; DATA-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
-; DATA-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; DATA-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; DATA:       scalar.ph:
-; DATA-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; DATA-NEXT:    br label [[FOR_BODY:%.*]]
-; DATA:       for.cond.cleanup.loopexit:
-; DATA-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
-; DATA:       for.cond.cleanup:
-; DATA-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; DATA-NEXT:    ret i64 [[IDX_0_LCSSA]]
-; DATA:       for.body:
-; DATA-NEXT:    [[I_010:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; DATA-NEXT:    [[IDX_09:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
-; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[I_010]]
-; DATA-NEXT:    [[TMP28:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; DATA-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[I_010]]
-; DATA-NEXT:    [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
-; DATA-NEXT:    [[CMP2:%.*]] = icmp sgt i64 [[TMP28]], [[TMP29]]
-; DATA-NEXT:    [[COND]] = select i1 [[CMP2]], i64 [[I_010]], i64 [[IDX_09]]
-; DATA-NEXT:    [[INC]] = add nuw i64 [[I_010]], 1
-; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-;
-entry:
-  %cmp8.not = icmp eq i64 %n, 0
-  br i1 %cmp8.not, label %for.cond.cleanup, label %for.body.preheader
-
-for.body.preheader:                               ; preds = %entry
-  br label %for.body
-
-for.cond.cleanup.loopexit:                        ; preds = %for.body
-  %cond.lcssa = phi i64 [ %cond, %for.body ]
-  br label %for.cond.cleanup
-
-for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
-  %idx.0.lcssa = phi i64 [ %ii, %entry ], [ %cond.lcssa, %for.cond.cleanup.loopexit ]
-  ret i64 %idx.0.lcssa
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %i.010 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader ]
-  %idx.09 = phi i64 [ %cond, %for.body ], [ %ii, %for.body.preheader ]
-  %arrayidx = getelementptr inbounds i64, ptr %a, i64 %i.010
-  %0 = load i64, ptr %arrayidx, align 8
-  %arrayidx1 = getelementptr inbounds i64, ptr %b, i64 %i.010
-  %1 = load i64, ptr %arrayidx1, align 8
-  %cmp2 = icmp sgt i64 %0, %1
-  %cond = select i1 %cmp2, i64 %i.010, i64 %idx.09
-  %inc = add nuw i64 %i.010, 1
-  %exitcond.not = icmp eq i64 %inc, %n
-  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; uint64_t idx_scalar_dec(int64_t *a, int64_t *b, uint64_t ii, uint64_t n) {
-;   uint64_t idx = ii;
-;   for (uint64_t i = n; i > 0; --i) // decreasing
-;      idx = (a[i - 1] > b[i - 1]) ? i : idx;
-;   return idx;
-; }
-define dso_local i64 @idx_scalar_dec(ptr %a, ptr %b, i64 %ii, i64 %n) {
-; EVL-LABEL: @idx_scalar_dec(
-; EVL-NEXT:  entry:
-; EVL-NEXT:    [[CMP_NOT9:%.*]] = icmp eq i64 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP_NOT9]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
-; EVL:       for.body.preheader:
-; EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
-; EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; EVL:       vector.ph:
-; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; EVL-NEXT:    [[IND_END:%.*]] = sub i64 [[N]], [[N_VEC]]
-; EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[N]], i64 0
-; EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
-; EVL-NEXT:    [[INDUCTION:%.*]] = add <8 x i64> [[DOTSPLAT]], <i64 0, i64 -1, i64 -2, i64 -3, i64 -4, i64 -5, i64 -6, i64 -7>
-; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; EVL:       vector.body:
-; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <8 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[VEC_IND:%.*]] = phi <8 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <8 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[TMP0:%.*]] = add <8 x i64> [[VEC_IND]], <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
-; EVL-NEXT:    [[TMP1:%.*]] = extractelement <8 x i64> [[TMP0]], i32 0
-; EVL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP1]]
-; EVL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
-; EVL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 -7
-; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i64>, ptr [[TMP4]], align 8
-; EVL-NEXT:    [[REVERSE:%.*]] = shufflevector <8 x i64> [[WIDE_LOAD]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; EVL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP1]]
-; EVL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
-; EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -7
-; EVL-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i64>, ptr [[TMP7]], align 8
-; EVL-NEXT:    [[REVERSE2:%.*]] = shufflevector <8 x i64> [[WIDE_LOAD1]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; EVL-NEXT:    [[TMP8:%.*]] = icmp sgt <8 x i64> [[REVERSE]], [[REVERSE2]]
-; EVL-NEXT:    [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP8]])
-; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP9]], <8 x i1> [[TMP8]], <8 x i1> [[CSA_MASK_PHI]]
-; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP9]], <8 x i64> [[VEC_IND]], <8 x i64> [[CSA_DATA_PHI]]
-; EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; EVL-NEXT:    [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], <i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8>
-; EVL-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
-; EVL:       middle.block:
-; EVL-NEXT:    [[TMP11:%.*]] = select <8 x i1> [[CSA_MASK_SEL]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, <8 x i32> zeroinitializer
-; EVL-NEXT:    [[TMP12:%.*]] = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> [[TMP11]])
-; EVL-NEXT:    [[TMP13:%.*]] = extractelement <8 x i1> [[CSA_MASK_SEL]], i64 0
-; EVL-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], 0
-; EVL-NEXT:    [[TMP15:%.*]] = and i1 [[TMP13]], [[TMP14]]
-; EVL-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 0, i32 -1
-; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <8 x i64> [[CSA_DATA_SEL]], i32 [[TMP16]]
-; EVL-NEXT:    [[TMP17:%.*]] = icmp sge i32 [[TMP16]], 0
-; EVL-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
-; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; EVL:       scalar.ph:
-; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ]
-; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       for.cond.cleanup.loopexit:
-; EVL-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; EVL:       for.cond.cleanup:
-; EVL-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; EVL-NEXT:    ret i64 [[IDX_0_LCSSA]]
-; EVL:       for.body:
-; EVL-NEXT:    [[I_011:%.*]] = phi i64 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; EVL-NEXT:    [[IDX_010:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
-; EVL-NEXT:    [[SUB]] = add i64 [[I_011]], -1
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[SUB]]
-; EVL-NEXT:    [[TMP19:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[SUB]]
-; EVL-NEXT:    [[TMP20:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
-; EVL-NEXT:    [[CMP3:%.*]] = icmp sgt i64 [[TMP19]], [[TMP20]]
-; EVL-NEXT:    [[COND]] = select i1 [[CMP3]], i64 [[I_011]], i64 [[IDX_010]]
-; EVL-NEXT:    [[CMP_NOT:%.*]] = icmp eq i64 [[SUB]], 0
-; EVL-NEXT:    br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
-;
-; NO-EVL-LABEL: @idx_scalar_dec(
-; NO-EVL-NEXT:  entry:
-; NO-EVL-NEXT:    [[CMP_NOT9:%.*]] = icmp eq i64 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP_NOT9]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
-; NO-EVL:       for.body.preheader:
-; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
-; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; NO-EVL:       vector.ph:
-; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; NO-EVL-NEXT:    [[IND_END:%.*]] = sub i64 [[N]], [[N_VEC]]
-; NO-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[N]], i64 0
-; NO-EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
-; NO-EVL-NEXT:    [[INDUCTION:%.*]] = add <8 x i64> [[DOTSPLAT]], <i64 0, i64 -1, i64 -2, i64 -3, i64 -4, i64 -5, i64 -6, i64 -7>
-; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; NO-EVL:       vector.body:
-; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <8 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[VEC_IND:%.*]] = phi <8 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <8 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[TMP0:%.*]] = add <8 x i64> [[VEC_IND]], <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
-; NO-EVL-NEXT:    [[TMP1:%.*]] = extractelement <8 x i64> [[TMP0]], i32 0
-; NO-EVL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP1]]
-; NO-EVL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
-; NO-EVL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 -7
-; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i64>, ptr [[TMP4]], align 8
-; NO-EVL-NEXT:    [[REVERSE:%.*]] = shufflevector <8 x i64> [[WIDE_LOAD]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; NO-EVL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP1]]
-; NO-EVL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
-; NO-EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -7
-; NO-EVL-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i64>, ptr [[TMP7]], align 8
-; NO-EVL-NEXT:    [[REVERSE2:%.*]] = shufflevector <8 x i64> [[WIDE_LOAD1]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; NO-EVL-NEXT:    [[TMP8:%.*]] = icmp sgt <8 x i64> [[REVERSE]], [[REVERSE2]]
-; NO-EVL-NEXT:    [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP8]])
-; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP9]], <8 x i1> [[TMP8]], <8 x i1> [[CSA_MASK_PHI]]
-; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP9]], <8 x i64> [[VEC_IND]], <8 x i64> [[CSA_DATA_PHI]]
-; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; NO-EVL-NEXT:    [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], <i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8>
-; NO-EVL-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
-; NO-EVL:       middle.block:
-; NO-EVL-NEXT:    [[TMP11:%.*]] = select <8 x i1> [[CSA_MASK_SEL]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, <8 x i32> zeroinitializer
-; NO-EVL-NEXT:    [[TMP12:%.*]] = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> [[TMP11]])
-; NO-EVL-NEXT:    [[TMP13:%.*]] = extractelement <8 x i1> [[CSA_MASK_SEL]], i64 0
-; NO-EVL-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], 0
-; NO-EVL-NEXT:    [[TMP15:%.*]] = and i1 [[TMP13]], [[TMP14]]
-; NO-EVL-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 0, i32 -1
-; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <8 x i64> [[CSA_DATA_SEL]], i32 [[TMP16]]
-; NO-EVL-NEXT:    [[TMP17:%.*]] = icmp sge i32 [[TMP16]], 0
-; NO-EVL-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
-; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; NO-EVL:       scalar.ph:
-; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ]
-; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       for.cond.cleanup.loopexit:
-; NO-EVL-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; NO-EVL:       for.cond.cleanup:
-; NO-EVL-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; NO-EVL-NEXT:    ret i64 [[IDX_0_LCSSA]]
-; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[I_011:%.*]] = phi i64 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; NO-EVL-NEXT:    [[IDX_010:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
-; NO-EVL-NEXT:    [[SUB]] = add i64 [[I_011]], -1
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[SUB]]
-; NO-EVL-NEXT:    [[TMP19:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; NO-EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[SUB]]
-; NO-EVL-NEXT:    [[TMP20:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
-; NO-EVL-NEXT:    [[CMP3:%.*]] = icmp sgt i64 [[TMP19]], [[TMP20]]
-; NO-EVL-NEXT:    [[COND]] = select i1 [[CMP3]], i64 [[I_011]], i64 [[IDX_010]]
-; NO-EVL-NEXT:    [[CMP_NOT:%.*]] = icmp eq i64 [[SUB]], 0
-; NO-EVL-NEXT:    br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
-;
-; DATA-LABEL: @idx_scalar_dec(
-; DATA-NEXT:  entry:
-; DATA-NEXT:    [[CMP_NOT9:%.*]] = icmp eq i64 [[N:%.*]], 0
-; DATA-NEXT:    br i1 [[CMP_NOT9]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
-; DATA:       for.body.preheader:
-; DATA-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
-; DATA-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DATA:       vector.ph:
-; DATA-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; DATA-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; DATA-NEXT:    [[IND_END:%.*]] = sub i64 [[N]], [[N_VEC]]
-; DATA-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[N]], i64 0
-; DATA-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
-; DATA-NEXT:    [[INDUCTION:%.*]] = add <8 x i64> [[DOTSPLAT]], <i64 0, i64 -1, i64 -2, i64 -3, i64 -4, i64 -5, i64 -6, i64 -7>
-; DATA-NEXT:    br label [[VECTOR_BODY:%.*]]
-; DATA:       vector.body:
-; DATA-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <8 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[VEC_IND:%.*]] = phi <8 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <8 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[TMP0:%.*]] = add <8 x i64> [[VEC_IND]], <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
-; DATA-NEXT:    [[TMP1:%.*]] = extractelement <8 x i64> [[TMP0]], i32 0
-; DATA-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP1]]
-; DATA-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
-; DATA-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 -7
-; DATA-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i64>, ptr [[TMP4]], align 8
-; DATA-NEXT:    [[REVERSE:%.*]] = shufflevector <8 x i64> [[WIDE_LOAD]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; DATA-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP1]]
-; DATA-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
-; DATA-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -7
-; DATA-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i64>, ptr [[TMP7]], align 8
-; DATA-NEXT:    [[REVERSE2:%.*]] = shufflevector <8 x i64> [[WIDE_LOAD1]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; DATA-NEXT:    [[TMP8:%.*]] = icmp sgt <8 x i64> [[REVERSE]], [[REVERSE2]]
-; DATA-NEXT:    [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP8]])
-; DATA-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP9]], <8 x i1> [[TMP8]], <8 x i1> [[CSA_MASK_PHI]]
-; DATA-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP9]], <8 x i64> [[VEC_IND]], <8 x i64> [[CSA_DATA_PHI]]
-; DATA-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; DATA-NEXT:    [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], <i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8>
-; DATA-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DATA-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
-; DATA:       middle.block:
-; DATA-NEXT:    [[TMP11:%.*]] = select <8 x i1> [[CSA_MASK_SEL]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, <8 x i32> zeroinitializer
-; DATA-NEXT:    [[TMP12:%.*]] = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> [[TMP11]])
-; DATA-NEXT:    [[TMP13:%.*]] = extractelement <8 x i1> [[CSA_MASK_SEL]], i64 0
-; DATA-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], 0
-; DATA-NEXT:    [[TMP15:%.*]] = and i1 [[TMP13]], [[TMP14]]
-; DATA-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 0, i32 -1
-; DATA-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <8 x i64> [[CSA_DATA_SEL]], i32 [[TMP16]]
-; DATA-NEXT:    [[TMP17:%.*]] = icmp sge i32 [[TMP16]], 0
-; DATA-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
-; DATA-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; DATA-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; DATA:       scalar.ph:
-; DATA-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ]
-; DATA-NEXT:    br label [[FOR_BODY:%.*]]
-; DATA:       for.cond.cleanup.loopexit:
-; DATA-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
-; DATA:       for.cond.cleanup:
-; DATA-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; DATA-NEXT:    ret i64 [[IDX_0_LCSSA]]
-; DATA:       for.body:
-; DATA-NEXT:    [[I_011:%.*]] = phi i64 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; DATA-NEXT:    [[IDX_010:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
-; DATA-NEXT:    [[SUB]] = add i64 [[I_011]], -1
-; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[SUB]]
-; DATA-NEXT:    [[TMP19:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; DATA-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[SUB]]
-; DATA-NEXT:    [[TMP20:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
-; DATA-NEXT:    [[CMP3:%.*]] = icmp sgt i64 [[TMP19]], [[TMP20]]
-; DATA-NEXT:    [[COND]] = select i1 [[CMP3]], i64 [[I_011]], i64 [[IDX_010]]
-; DATA-NEXT:    [[CMP_NOT:%.*]] = icmp eq i64 [[SUB]], 0
-; DATA-NEXT:    br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
-;
-entry:
-  %cmp.not9 = icmp eq i64 %n, 0
-  br i1 %cmp.not9, label %for.cond.cleanup, label %for.body.preheader
-
-for.body.preheader:                               ; preds = %entry
-  br label %for.body
-
-for.cond.cleanup.loopexit:                        ; preds = %for.body
-  %cond.lcssa = phi i64 [ %cond, %for.body ]
-  br label %for.cond.cleanup
-
-for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
-  %idx.0.lcssa = phi i64 [ %ii, %entry ], [ %cond.lcssa, %for.cond.cleanup.loopexit ]
-  ret i64 %idx.0.lcssa
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %i.011 = phi i64 [ %sub, %for.body ], [ %n, %for.body.preheader ]
-  %idx.010 = phi i64 [ %cond, %for.body ], [ %ii, %for.body.preheader ]
-  %sub = add i64 %i.011, -1
-  %arrayidx = getelementptr inbounds i64, ptr %a, i64 %sub
-  %0 = load i64, ptr %arrayidx, align 8
-  %arrayidx2 = getelementptr inbounds i64, ptr %b, i64 %sub
-  %1 = load i64, ptr %arrayidx2, align 8
-  %cmp3 = icmp sgt i64 %0, %1
-  %cond = select i1 %cmp3, i64 %i.011, i64 %idx.010
-  %cmp.not = icmp eq i64 %sub, 0
-  br i1 %cmp.not, label %for.cond.cleanup.loopexit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; The key part of this function is that the true arm of the select corresponds
-; to selecting the initial value, instead of selecting the new value.
-; int simple_csa_int_select_neg_cond(int N, int *data) {
-;   int t = 0;
-;   for (int i = 0; i < N; i++) {
-;     if (i != data[i])
-;       t = data[i];
-;   }
-;   return t; // use t
-; }
-define i32 @simple_csa_int_select_neg_cond(i32 %N, ptr %data) {
-; EVL-LABEL: @simple_csa_int_select_neg_cond(
-; EVL-NEXT:  entry:
-; EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; EVL:       for.body.preheader:
-; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; EVL:       vector.ph:
-; EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; EVL-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; EVL-NEXT:    [[TMP7:%.*]] = add <vscale x 4 x i64> [[TMP6]], zeroinitializer
-; EVL-NEXT:    [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP7]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
-; EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
-; EVL-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP10]]
-; EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP11]], i64 0
-; EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; EVL:       vector.body:
-; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP12]]
-; EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
-; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
-; EVL-NEXT:    [[TMP15:%.*]] = zext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
-; EVL-NEXT:    [[TMP16:%.*]] = icmp eq <vscale x 4 x i64> [[VEC_IND]], [[TMP15]]
-; EVL-NEXT:    [[TMP17:%.*]] = xor <vscale x 4 x i1> [[TMP16]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; EVL-NEXT:    [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP17]])
-; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP18]], <vscale x 4 x i1> [[TMP17]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
-; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP18]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
-; EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; EVL-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
-; EVL:       middle.block:
-; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; EVL-NEXT:    [[TMP20:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
-; EVL-NEXT:    [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP20]])
-; EVL-NEXT:    [[TMP22:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
-; EVL-NEXT:    [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
-; EVL-NEXT:    [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
-; EVL-NEXT:    [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
-; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP25]]
-; EVL-NEXT:    [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
-; EVL-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[CSA_EXTRACT]], i32 0
-; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; EVL:       scalar.ph:
-; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       for.cond.cleanup.loopexit:
-; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; EVL:       for.cond.cleanup:
-; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; EVL-NEXT:    ret i32 [[T_0_LCSSA]]
-; EVL:       for.body:
-; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[T_010:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP28:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT:    [[TMP29:%.*]] = zext i32 [[TMP28]] to i64
-; EVL-NEXT:    [[CMP1_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], [[TMP29]]
-; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP28]]
-; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
-;
-; NO-EVL-LABEL: @simple_csa_int_select_neg_cond(
-; NO-EVL-NEXT:  entry:
-; NO-EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; NO-EVL:       for.body.preheader:
-; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; NO-EVL:       vector.ph:
-; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; NO-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; NO-EVL-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; NO-EVL-NEXT:    [[TMP7:%.*]] = add <vscale x 4 x i64> [[TMP6]], zeroinitializer
-; NO-EVL-NEXT:    [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP7]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; NO-EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
-; NO-EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
-; NO-EVL-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP10]]
-; NO-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP11]], i64 0
-; NO-EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; NO-EVL:       vector.body:
-; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; NO-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP12]]
-; NO-EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
-; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
-; NO-EVL-NEXT:    [[TMP15:%.*]] = zext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
-; NO-EVL-NEXT:    [[TMP16:%.*]] = icmp eq <vscale x 4 x i64> [[VEC_IND]], [[TMP15]]
-; NO-EVL-NEXT:    [[TMP17:%.*]] = xor <vscale x 4 x i1> [[TMP16]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; NO-EVL-NEXT:    [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP17]])
-; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP18]], <vscale x 4 x i1> [[TMP17]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
-; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP18]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
-; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; NO-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; NO-EVL-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
-; NO-EVL:       middle.block:
-; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; NO-EVL-NEXT:    [[TMP20:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
-; NO-EVL-NEXT:    [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP20]])
-; NO-EVL-NEXT:    [[TMP22:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
-; NO-EVL-NEXT:    [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
-; NO-EVL-NEXT:    [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
-; NO-EVL-NEXT:    [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
-; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP25]]
-; NO-EVL-NEXT:    [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
-; NO-EVL-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[CSA_EXTRACT]], i32 0
-; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; NO-EVL:       scalar.ph:
-; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       for.cond.cleanup.loopexit:
-; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; NO-EVL:       for.cond.cleanup:
-; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; NO-EVL-NEXT:    ret i32 [[T_0_LCSSA]]
-; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[T_010:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP28:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; NO-EVL-NEXT:    [[TMP29:%.*]] = zext i32 [[TMP28]] to i64
-; NO-EVL-NEXT:    [[CMP1_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], [[TMP29]]
-; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP28]]
-; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
-;
-; DATA-LABEL: @simple_csa_int_select_neg_cond(
-; DATA-NEXT:  entry:
-; DATA-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA:       for.body.preheader:
-; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; DATA-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; DATA-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DATA:       vector.ph:
-; DATA-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; DATA-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; DATA-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; DATA-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; DATA-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; DATA-NEXT:    [[TMP7:%.*]] = add <vscale x 4 x i64> [[TMP6]], zeroinitializer
-; DATA-NEXT:    [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP7]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; DATA-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
-; DATA-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
-; DATA-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP10]]
-; DATA-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP11]], i64 0
-; DATA-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; DATA-NEXT:    br label [[VECTOR_BODY:%.*]]
-; DATA:       vector.body:
-; DATA-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; DATA-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP12]]
-; DATA-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
-; DATA-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
-; DATA-NEXT:    [[TMP15:%.*]] = zext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
-; DATA-NEXT:    [[TMP16:%.*]] = icmp eq <vscale x 4 x i64> [[VEC_IND]], [[TMP15]]
-; DATA-NEXT:    [[TMP17:%.*]] = xor <vscale x 4 x i1> [[TMP16]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; DATA-NEXT:    [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP17]])
-; DATA-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP18]], <vscale x 4 x i1> [[TMP17]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
-; DATA-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP18]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
-; DATA-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; DATA-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; DATA-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DATA-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
-; DATA:       middle.block:
-; DATA-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; DATA-NEXT:    [[TMP20:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
-; DATA-NEXT:    [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP20]])
-; DATA-NEXT:    [[TMP22:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
-; DATA-NEXT:    [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
-; DATA-NEXT:    [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
-; DATA-NEXT:    [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
-; DATA-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP25]]
-; DATA-NEXT:    [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
-; DATA-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[CSA_EXTRACT]], i32 0
-; DATA-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; DATA-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; DATA:       scalar.ph:
-; DATA-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; DATA-NEXT:    br label [[FOR_BODY:%.*]]
-; DATA:       for.cond.cleanup.loopexit:
-; DATA-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
-; DATA:       for.cond.cleanup:
-; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; DATA-NEXT:    ret i32 [[T_0_LCSSA]]
-; DATA:       for.body:
-; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[T_010:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP28:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; DATA-NEXT:    [[TMP29:%.*]] = zext i32 [[TMP28]] to i64
-; DATA-NEXT:    [[CMP1_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], [[TMP29]]
-; DATA-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP28]]
-; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
-;
-entry:
-  %cmp9 = icmp sgt i32 %N, 0
-  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %N to i64
-  br label %for.body
-
-for.cond.cleanup.loopexit:                        ; preds = %for.body
-  %spec.select.lcssa = phi i32 [ %spec.select, %for.body ]
-  br label %for.cond.cleanup
-
-for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
-  %t.0.lcssa = phi i32 [ 0, %entry ], [ %spec.select.lcssa, %for.cond.cleanup.loopexit ]
-  ret i32 %t.0.lcssa
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-  %t.010 = phi i32 [ 0, %for.body.preheader ], [ %spec.select, %for.body ]
-  %arrayidx = getelementptr inbounds i32, ptr %data, i64 %indvars.iv
-  %0 = load i32, ptr %arrayidx, align 4
-  %1 = zext i32 %0 to i64
-  %cmp1.not = icmp eq i64 %indvars.iv, %1
-  %spec.select = select i1 %cmp1.not, i32 %t.010, i32 %0
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; int *simple_csa_ptr_select(int N, int **data) {
-;   int *t = nullptr;
-;   for (int i = 0; i < N; i++) {
-;     if (i < *data[i])
-;       t = data[i];
-;   }
-;   return t; // use t
-; }
-define ptr @simple_csa_ptr_select(i32 %N, ptr %data) {
-; EVL-LABEL: @simple_csa_ptr_select(
-; EVL-NEXT:  entry:
-; EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; EVL:       for.body.preheader:
-; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
-; EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; EVL:       vector.ph:
-; EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
-; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
-; EVL-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
-; EVL-NEXT:    [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
-; EVL-NEXT:    [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
-; EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
-; EVL-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP10]]
-; EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
-; EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; EVL:       vector.body:
-; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x ptr> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[TMP12]]
-; EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds ptr, ptr [[TMP13]], i32 0
-; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x ptr>, ptr [[TMP14]], align 8
-; EVL-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> [[WIDE_LOAD]], i32 4, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i32> poison)
-; EVL-NEXT:    [[TMP15:%.*]] = sext <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i64>
-; EVL-NEXT:    [[TMP16:%.*]] = icmp slt <vscale x 2 x i64> [[VEC_IND]], [[TMP15]]
-; EVL-NEXT:    [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP16]])
-; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 2 x i1> [[TMP16]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
-; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 2 x ptr> [[WIDE_LOAD]], <vscale x 2 x ptr> [[CSA_DATA_PHI]]
-; EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; EVL-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
-; EVL:       middle.block:
-; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; EVL-NEXT:    [[TMP19:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
-; EVL-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP19]])
-; EVL-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
-; EVL-NEXT:    [[TMP22:%.*]] = icmp eq i32 [[TMP20]], 0
-; EVL-NEXT:    [[TMP23:%.*]] = and i1 [[TMP21]], [[TMP22]]
-; EVL-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i32 0, i32 -1
-; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x ptr> [[CSA_DATA_SEL]], i32 [[TMP24]]
-; EVL-NEXT:    [[TMP25:%.*]] = icmp sge i32 [[TMP24]], 0
-; EVL-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], ptr [[CSA_EXTRACT]], ptr null
-; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; EVL:       scalar.ph:
-; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       for.cond.cleanup.loopexit:
-; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; EVL:       for.cond.cleanup:
-; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; EVL-NEXT:    ret ptr [[T_0_LCSSA]]
-; EVL:       for.body:
-; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[T_010:%.*]] = phi ptr [ null, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
-; EVL-NEXT:    [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4
-; EVL-NEXT:    [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
-; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP29]]
-; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP27]], ptr [[T_010]]
-; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
-;
-; NO-EVL-LABEL: @simple_csa_ptr_select(
-; NO-EVL-NEXT:  entry:
-; NO-EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; NO-EVL:       for.body.preheader:
-; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
-; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; NO-EVL:       vector.ph:
-; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
-; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; NO-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
-; NO-EVL-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
-; NO-EVL-NEXT:    [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
-; NO-EVL-NEXT:    [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; NO-EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
-; NO-EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
-; NO-EVL-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP10]]
-; NO-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
-; NO-EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; NO-EVL:       vector.body:
-; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x ptr> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; NO-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[TMP12]]
-; NO-EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds ptr, ptr [[TMP13]], i32 0
-; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x ptr>, ptr [[TMP14]], align 8
-; NO-EVL-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> [[WIDE_LOAD]], i32 4, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i32> poison)
-; NO-EVL-NEXT:    [[TMP15:%.*]] = sext <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i64>
-; NO-EVL-NEXT:    [[TMP16:%.*]] = icmp slt <vscale x 2 x i64> [[VEC_IND]], [[TMP15]]
-; NO-EVL-NEXT:    [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP16]])
-; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 2 x i1> [[TMP16]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
-; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 2 x ptr> [[WIDE_LOAD]], <vscale x 2 x ptr> [[CSA_DATA_PHI]]
-; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; NO-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; NO-EVL-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
-; NO-EVL:       middle.block:
-; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; NO-EVL-NEXT:    [[TMP19:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
-; NO-EVL-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP19]])
-; NO-EVL-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
-; NO-EVL-NEXT:    [[TMP22:%.*]] = icmp eq i32 [[TMP20]], 0
-; NO-EVL-NEXT:    [[TMP23:%.*]] = and i1 [[TMP21]], [[TMP22]]
-; NO-EVL-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i32 0, i32 -1
-; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x ptr> [[CSA_DATA_SEL]], i32 [[TMP24]]
-; NO-EVL-NEXT:    [[TMP25:%.*]] = icmp sge i32 [[TMP24]], 0
-; NO-EVL-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], ptr [[CSA_EXTRACT]], ptr null
-; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; NO-EVL:       scalar.ph:
-; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       for.cond.cleanup.loopexit:
-; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; NO-EVL:       for.cond.cleanup:
-; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; NO-EVL-NEXT:    ret ptr [[T_0_LCSSA]]
-; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[T_010:%.*]] = phi ptr [ null, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
-; NO-EVL-NEXT:    [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4
-; NO-EVL-NEXT:    [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
-; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP29]]
-; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP27]], ptr [[T_010]]
-; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
-;
-; DATA-LABEL: @simple_csa_ptr_select(
-; DATA-NEXT:  entry:
-; DATA-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA:       for.body.preheader:
-; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
-; DATA-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; DATA-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DATA:       vector.ph:
-; DATA-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
-; DATA-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; DATA-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; DATA-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
-; DATA-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
-; DATA-NEXT:    [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
-; DATA-NEXT:    [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; DATA-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
-; DATA-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
-; DATA-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP10]]
-; DATA-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
-; DATA-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; DATA-NEXT:    br label [[VECTOR_BODY:%.*]]
-; DATA:       vector.body:
-; DATA-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x ptr> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; DATA-NEXT:    [[TMP13:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[TMP12]]
-; DATA-NEXT:    [[TMP14:%.*]] = getelementptr inbounds ptr, ptr [[TMP13]], i32 0
-; DATA-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x ptr>, ptr [[TMP14]], align 8
-; DATA-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> [[WIDE_LOAD]], i32 4, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i32> poison)
-; DATA-NEXT:    [[TMP15:%.*]] = sext <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i64>
-; DATA-NEXT:    [[TMP16:%.*]] = icmp slt <vscale x 2 x i64> [[VEC_IND]], [[TMP15]]
-; DATA-NEXT:    [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP16]])
-; DATA-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 2 x i1> [[TMP16]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
-; DATA-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 2 x ptr> [[WIDE_LOAD]], <vscale x 2 x ptr> [[CSA_DATA_PHI]]
-; DATA-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; DATA-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; DATA-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DATA-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
-; DATA:       middle.block:
-; DATA-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; DATA-NEXT:    [[TMP19:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
-; DATA-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP19]])
-; DATA-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
-; DATA-NEXT:    [[TMP22:%.*]] = icmp eq i32 [[TMP20]], 0
-; DATA-NEXT:    [[TMP23:%.*]] = and i1 [[TMP21]], [[TMP22]]
-; DATA-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i32 0, i32 -1
-; DATA-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x ptr> [[CSA_DATA_SEL]], i32 [[TMP24]]
-; DATA-NEXT:    [[TMP25:%.*]] = icmp sge i32 [[TMP24]], 0
-; DATA-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], ptr [[CSA_EXTRACT]], ptr null
-; DATA-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; DATA-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; DATA:       scalar.ph:
-; DATA-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; DATA-NEXT:    br label [[FOR_BODY:%.*]]
-; DATA:       for.cond.cleanup.loopexit:
-; DATA-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
-; DATA:       for.cond.cleanup:
-; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; DATA-NEXT:    ret ptr [[T_0_LCSSA]]
-; DATA:       for.body:
-; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[T_010:%.*]] = phi ptr [ null, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
-; DATA-NEXT:    [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4
-; DATA-NEXT:    [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
-; DATA-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP29]]
-; DATA-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP27]], ptr [[T_010]]
-; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
-;
-entry:
-  %cmp9 = icmp sgt i32 %N, 0
-  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:
-  %wide.trip.count = zext i32 %N to i64
-  br label %for.body
-
-for.cond.cleanup.loopexit:
-  %spec.select.lcssa = phi ptr [ %spec.select, %for.body ]
-  br label %for.cond.cleanup
-
-for.cond.cleanup:
-  %t.0.lcssa = phi ptr [ null, %entry ], [ %spec.select.lcssa, %for.cond.cleanup.loopexit ]
-  ret ptr %t.0.lcssa
-
-for.body:
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-  %t.010 = phi ptr [ null, %for.body.preheader ], [ %spec.select, %for.body ]
-  %arrayidx = getelementptr inbounds ptr, ptr %data, i64 %indvars.iv
-  %0 = load ptr, ptr %arrayidx, align 8
-  %1 = load i32, ptr %0, align 4
-  %2 = sext i32 %1 to i64
-  %cmp1 = icmp slt i64 %indvars.iv, %2
-  %spec.select = select i1 %cmp1, ptr %0, ptr %t.010
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
-}
diff --git a/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment.ll b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment.ll
new file mode 100644
index 00000000000000..0269f0c672a3fa
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment.ll
@@ -0,0 +1,3091 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -passes=loop-vectorize -force-tail-folding-style=data-with-evl \
+; RUN:   -enable-csa-vectorization -scalable-vectorization=on \
+; RUN:   -force-target-supports-scalable-vectors -force-target-instruction-cost=1 \
+; RUN:   | FileCheck %s -check-prefix=EVL
+; RUN: opt < %s -S -passes=loop-vectorize -force-tail-folding-style=none \
+; RUN:   -enable-csa-vectorization -scalable-vectorization=on \
+; RUN:   -force-target-supports-scalable-vectors -force-target-instruction-cost=1 \
+; RUN:   | FileCheck %s -check-prefix=NO-EVL
+; RUN: opt < %s -S -passes=loop-vectorize -force-tail-folding-style=data \
+; RUN:   -enable-csa-vectorization -scalable-vectorization=on \
+; RUN:   -force-target-supports-scalable-vectors -force-target-instruction-cost=1 \
+; RUN:   | FileCheck %s -check-prefix=DATA
+
+; This function is generated from the following C/C++ program:
+; int simple_csa_int_select(int N, int *data, int a) {
+;   int t = -1;
+;   for (int i = 0; i < N; i++) {
+;     if (a < data[i])
+;       t = data[i];
+;   }
+;   return t; // use t
+; }
+define i32 @simple_csa_int_select(i32 %N, ptr %data, i64 %a) {
+; EVL-LABEL: @simple_csa_int_select(
+; EVL-NEXT:  entry:
+; EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL:       for.body.preheader:
+; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
+; EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; EVL:       vector.ph:
+; EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[A:%.*]], i64 0
+; EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; EVL:       vector.body:
+; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; EVL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP3]]
+; EVL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP5]], align 4
+; EVL-NEXT:    [[TMP6:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
+; EVL-NEXT:    [[TMP7:%.*]] = icmp slt <vscale x 1 x i64> [[BROADCAST_SPLAT]], [[TMP6]]
+; EVL-NEXT:    [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP7]])
+; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP8]], <vscale x 1 x i1> [[TMP7]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP8]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
+; EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; EVL-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; EVL:       middle.block:
+; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; EVL-NEXT:    [[TMP10:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; EVL-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP10]])
+; EVL-NEXT:    [[TMP12:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[TMP11]], 0
+; EVL-NEXT:    [[TMP14:%.*]] = and i1 [[TMP12]], [[TMP13]]
+; EVL-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], i32 0, i32 -1
+; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP15]]
+; EVL-NEXT:    [[TMP16:%.*]] = icmp sge i32 [[TMP15]], 0
+; EVL-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[CSA_EXTRACT]], i32 -1
+; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL:       scalar.ph:
+; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; EVL:       for.cond.cleanup.loopexit:
+; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; EVL:       for.cond.cleanup:
+; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; EVL-NEXT:    ret i32 [[T_0_LCSSA]]
+; EVL:       for.body:
+; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT:    [[TMP19:%.*]] = sext i32 [[TMP18]] to i64
+; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP19]]
+; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP18]], i32 [[T_010]]
+; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+;
+; NO-EVL-LABEL: @simple_csa_int_select(
+; NO-EVL-NEXT:  entry:
+; NO-EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL:       for.body.preheader:
+; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
+; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-EVL:       vector.ph:
+; NO-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[A:%.*]], i64 0
+; NO-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NO-EVL:       vector.body:
+; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP3]]
+; NO-EVL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP5]], align 4
+; NO-EVL-NEXT:    [[TMP6:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
+; NO-EVL-NEXT:    [[TMP7:%.*]] = icmp slt <vscale x 1 x i64> [[BROADCAST_SPLAT]], [[TMP6]]
+; NO-EVL-NEXT:    [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP7]])
+; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP8]], <vscale x 1 x i1> [[TMP7]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP8]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; NO-EVL-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-EVL:       middle.block:
+; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; NO-EVL-NEXT:    [[TMP10:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; NO-EVL-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP10]])
+; NO-EVL-NEXT:    [[TMP12:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[TMP11]], 0
+; NO-EVL-NEXT:    [[TMP14:%.*]] = and i1 [[TMP12]], [[TMP13]]
+; NO-EVL-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], i32 0, i32 -1
+; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP15]]
+; NO-EVL-NEXT:    [[TMP16:%.*]] = icmp sge i32 [[TMP15]], 0
+; NO-EVL-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[CSA_EXTRACT]], i32 -1
+; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL:       scalar.ph:
+; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-EVL:       for.cond.cleanup.loopexit:
+; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; NO-EVL:       for.cond.cleanup:
+; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; NO-EVL-NEXT:    ret i32 [[T_0_LCSSA]]
+; NO-EVL:       for.body:
+; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT:    [[TMP19:%.*]] = sext i32 [[TMP18]] to i64
+; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP19]]
+; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP18]], i32 [[T_010]]
+; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+;
+; DATA-LABEL: @simple_csa_int_select(
+; DATA-NEXT:  entry:
+; DATA-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA:       for.body.preheader:
+; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
+; DATA-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DATA:       vector.ph:
+; DATA-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; DATA-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; DATA-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[A:%.*]], i64 0
+; DATA-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; DATA-NEXT:    br label [[VECTOR_BODY:%.*]]
+; DATA:       vector.body:
+; DATA-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; DATA-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP3]]
+; DATA-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; DATA-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP5]], align 4
+; DATA-NEXT:    [[TMP6:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
+; DATA-NEXT:    [[TMP7:%.*]] = icmp slt <vscale x 1 x i64> [[BROADCAST_SPLAT]], [[TMP6]]
+; DATA-NEXT:    [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP7]])
+; DATA-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP8]], <vscale x 1 x i1> [[TMP7]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; DATA-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP8]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
+; DATA-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; DATA-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DATA-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; DATA:       middle.block:
+; DATA-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; DATA-NEXT:    [[TMP10:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; DATA-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP10]])
+; DATA-NEXT:    [[TMP12:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; DATA-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[TMP11]], 0
+; DATA-NEXT:    [[TMP14:%.*]] = and i1 [[TMP12]], [[TMP13]]
+; DATA-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], i32 0, i32 -1
+; DATA-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP15]]
+; DATA-NEXT:    [[TMP16:%.*]] = icmp sge i32 [[TMP15]], 0
+; DATA-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[CSA_EXTRACT]], i32 -1
+; DATA-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; DATA-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DATA:       scalar.ph:
+; DATA-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; DATA-NEXT:    br label [[FOR_BODY:%.*]]
+; DATA:       for.cond.cleanup.loopexit:
+; DATA-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
+; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
+; DATA:       for.cond.cleanup:
+; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; DATA-NEXT:    ret i32 [[T_0_LCSSA]]
+; DATA:       for.body:
+; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; DATA-NEXT:    [[TMP19:%.*]] = sext i32 [[TMP18]] to i64
+; DATA-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP19]]
+; DATA-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP18]], i32 [[T_010]]
+; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+;
+entry:
+  %cmp9 = icmp sgt i32 %N, 0
+  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %spec.select.lcssa = phi i32 [ %spec.select, %for.body ]
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %t.0.lcssa = phi i32 [ -1, %entry ], [ %spec.select.lcssa, %for.cond.cleanup.loopexit ]
+  ret i32 %t.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %t.010 = phi i32 [ -1, %for.body.preheader ], [ %spec.select, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %data, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %1 = sext i32 %0 to i64
+  %cmp1 = icmp slt i64 %a, %1
+  %spec.select = select i1 %cmp1, i32 %0, i32 %t.010
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int simple_csa_int_select_induction_cmp(int N, int *data) {
+;   int t = -1;
+;   for (int i = 0; i < N; i++) {
+;     if (i < data[i])
+;       t = data[i];
+;   }
+;   return t; // use t
+; }
+define i32 @simple_csa_int_select_induction_cmp(i32 %N, ptr %data) {
+; EVL-LABEL: @simple_csa_int_select_induction_cmp(
+; EVL-NEXT:  entry:
+; EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL:       for.body.preheader:
+; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
+; EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; EVL:       vector.ph:
+; EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
+; EVL-NEXT:    [[TMP4:%.*]] = add <vscale x 1 x i64> [[TMP3]], zeroinitializer
+; EVL-NEXT:    [[TMP5:%.*]] = mul <vscale x 1 x i64> [[TMP4]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
+; EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP5]]
+; EVL-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP7:%.*]] = mul i64 1, [[TMP6]]
+; EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP7]], i64 0
+; EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; EVL:       vector.body:
+; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 0
+; EVL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP8]]
+; EVL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP10]], align 4
+; EVL-NEXT:    [[TMP11:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
+; EVL-NEXT:    [[TMP12:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP11]]
+; EVL-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP12]])
+; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP13]], <vscale x 1 x i1> [[TMP12]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP13]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
+; EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; EVL-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; EVL:       middle.block:
+; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; EVL-NEXT:    [[TMP15:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; EVL-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP15]])
+; EVL-NEXT:    [[TMP17:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT:    [[TMP18:%.*]] = icmp eq i32 [[TMP16]], 0
+; EVL-NEXT:    [[TMP19:%.*]] = and i1 [[TMP17]], [[TMP18]]
+; EVL-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 0, i32 -1
+; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP20]]
+; EVL-NEXT:    [[TMP21:%.*]] = icmp sge i32 [[TMP20]], 0
+; EVL-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[CSA_EXTRACT]], i32 -1
+; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL:       scalar.ph:
+; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; EVL:       for.cond.cleanup.loopexit:
+; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; EVL:       for.cond.cleanup:
+; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; EVL-NEXT:    ret i32 [[T_0_LCSSA]]
+; EVL:       for.body:
+; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT:    [[TMP24:%.*]] = sext i32 [[TMP23]] to i64
+; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP24]]
+; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP23]], i32 [[T_010]]
+; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+;
+; NO-EVL-LABEL: @simple_csa_int_select_induction_cmp(
+; NO-EVL-NEXT:  entry:
+; NO-EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL:       for.body.preheader:
+; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
+; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-EVL:       vector.ph:
+; NO-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
+; NO-EVL-NEXT:    [[TMP4:%.*]] = add <vscale x 1 x i64> [[TMP3]], zeroinitializer
+; NO-EVL-NEXT:    [[TMP5:%.*]] = mul <vscale x 1 x i64> [[TMP4]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
+; NO-EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP5]]
+; NO-EVL-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP7:%.*]] = mul i64 1, [[TMP6]]
+; NO-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP7]], i64 0
+; NO-EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NO-EVL:       vector.body:
+; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP8]]
+; NO-EVL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP10]], align 4
+; NO-EVL-NEXT:    [[TMP11:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
+; NO-EVL-NEXT:    [[TMP12:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP11]]
+; NO-EVL-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP12]])
+; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP13]], <vscale x 1 x i1> [[TMP12]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP13]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; NO-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; NO-EVL-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; NO-EVL:       middle.block:
+; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; NO-EVL-NEXT:    [[TMP15:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; NO-EVL-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP15]])
+; NO-EVL-NEXT:    [[TMP17:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT:    [[TMP18:%.*]] = icmp eq i32 [[TMP16]], 0
+; NO-EVL-NEXT:    [[TMP19:%.*]] = and i1 [[TMP17]], [[TMP18]]
+; NO-EVL-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 0, i32 -1
+; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP20]]
+; NO-EVL-NEXT:    [[TMP21:%.*]] = icmp sge i32 [[TMP20]], 0
+; NO-EVL-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[CSA_EXTRACT]], i32 -1
+; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL:       scalar.ph:
+; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-EVL:       for.cond.cleanup.loopexit:
+; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; NO-EVL:       for.cond.cleanup:
+; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; NO-EVL-NEXT:    ret i32 [[T_0_LCSSA]]
+; NO-EVL:       for.body:
+; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT:    [[TMP24:%.*]] = sext i32 [[TMP23]] to i64
+; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP24]]
+; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP23]], i32 [[T_010]]
+; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+;
+; DATA-LABEL: @simple_csa_int_select_induction_cmp(
+; DATA-NEXT:  entry:
+; DATA-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA:       for.body.preheader:
+; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
+; DATA-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DATA:       vector.ph:
+; DATA-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; DATA-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; DATA-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
+; DATA-NEXT:    [[TMP4:%.*]] = add <vscale x 1 x i64> [[TMP3]], zeroinitializer
+; DATA-NEXT:    [[TMP5:%.*]] = mul <vscale x 1 x i64> [[TMP4]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
+; DATA-NEXT:    [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP5]]
+; DATA-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[TMP7:%.*]] = mul i64 1, [[TMP6]]
+; DATA-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP7]], i64 0
+; DATA-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; DATA-NEXT:    br label [[VECTOR_BODY:%.*]]
+; DATA:       vector.body:
+; DATA-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 0
+; DATA-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP8]]
+; DATA-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; DATA-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP10]], align 4
+; DATA-NEXT:    [[TMP11:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
+; DATA-NEXT:    [[TMP12:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP11]]
+; DATA-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP12]])
+; DATA-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP13]], <vscale x 1 x i1> [[TMP12]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; DATA-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP13]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
+; DATA-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; DATA-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; DATA-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DATA-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; DATA:       middle.block:
+; DATA-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; DATA-NEXT:    [[TMP15:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; DATA-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP15]])
+; DATA-NEXT:    [[TMP17:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; DATA-NEXT:    [[TMP18:%.*]] = icmp eq i32 [[TMP16]], 0
+; DATA-NEXT:    [[TMP19:%.*]] = and i1 [[TMP17]], [[TMP18]]
+; DATA-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 0, i32 -1
+; DATA-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP20]]
+; DATA-NEXT:    [[TMP21:%.*]] = icmp sge i32 [[TMP20]], 0
+; DATA-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[CSA_EXTRACT]], i32 -1
+; DATA-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; DATA-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DATA:       scalar.ph:
+; DATA-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; DATA-NEXT:    br label [[FOR_BODY:%.*]]
+; DATA:       for.cond.cleanup.loopexit:
+; DATA-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
+; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
+; DATA:       for.cond.cleanup:
+; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; DATA-NEXT:    ret i32 [[T_0_LCSSA]]
+; DATA:       for.body:
+; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; DATA-NEXT:    [[TMP24:%.*]] = sext i32 [[TMP23]] to i64
+; DATA-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP24]]
+; DATA-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP23]], i32 [[T_010]]
+; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+;
+entry:
+  %cmp9 = icmp sgt i32 %N, 0
+  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %spec.select.lcssa = phi i32 [ %spec.select, %for.body ]
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %t.0.lcssa = phi i32 [ -1, %entry ], [ %spec.select.lcssa, %for.cond.cleanup.loopexit ]
+  ret i32 %t.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %t.010 = phi i32 [ -1, %for.body.preheader ], [ %spec.select, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %data, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %1 = sext i32 %0 to i64
+  %cmp1 = icmp slt i64 %indvars.iv, %1
+  %spec.select = select i1 %cmp1, i32 %0, i32 %t.010
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; float simple_csa_float_select(int N, float *data) {
+;   float t = 1.0f;
+;   for (int i = 0; i < N; i++) {
+;     if (0.0f < data[i])
+;       t = data[i];
+;   }
+;   return t; // use t
+; }
+define float @simple_csa_float_select(i32 %N, ptr %data) {
+; EVL-LABEL: @simple_csa_float_select(
+; EVL-NEXT:  entry:
+; EVL-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT:    br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL:       for.body.preheader:
+; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
+; EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; EVL:       vector.ph:
+; EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; EVL:       vector.body:
+; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; EVL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[TMP3]]
+; EVL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 0
+; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x float>, ptr [[TMP5]], align 4
+; EVL-NEXT:    [[TMP6:%.*]] = fcmp ogt <vscale x 1 x float> [[WIDE_LOAD]], zeroinitializer
+; EVL-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP6]])
+; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP7]], <vscale x 1 x i1> [[TMP6]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP7]], <vscale x 1 x float> [[WIDE_LOAD]], <vscale x 1 x float> [[CSA_DATA_PHI]]
+; EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; EVL-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; EVL:       middle.block:
+; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; EVL-NEXT:    [[TMP9:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; EVL-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP9]])
+; EVL-NEXT:    [[TMP11:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[TMP10]], 0
+; EVL-NEXT:    [[TMP13:%.*]] = and i1 [[TMP11]], [[TMP12]]
+; EVL-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 0, i32 -1
+; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x float> [[CSA_DATA_SEL]], i32 [[TMP14]]
+; EVL-NEXT:    [[TMP15:%.*]] = icmp sge i32 [[TMP14]], 0
+; EVL-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], float [[CSA_EXTRACT]], float 1.000000e+00
+; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL:       scalar.ph:
+; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; EVL:       for.cond.cleanup.loopexit:
+; EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; EVL:       for.cond.cleanup:
+; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; EVL-NEXT:    ret float [[T_0_LCSSA]]
+; EVL:       for.body:
+; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[T_09:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP17:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP17]], 0.000000e+00
+; EVL-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP17]], float [[T_09]]
+; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+;
+; NO-EVL-LABEL: @simple_csa_float_select(
+; NO-EVL-NEXT:  entry:
+; NO-EVL-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT:    br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL:       for.body.preheader:
+; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
+; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-EVL:       vector.ph:
+; NO-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NO-EVL:       vector.body:
+; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[TMP3]]
+; NO-EVL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 0
+; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x float>, ptr [[TMP5]], align 4
+; NO-EVL-NEXT:    [[TMP6:%.*]] = fcmp ogt <vscale x 1 x float> [[WIDE_LOAD]], zeroinitializer
+; NO-EVL-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP6]])
+; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP7]], <vscale x 1 x i1> [[TMP6]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP7]], <vscale x 1 x float> [[WIDE_LOAD]], <vscale x 1 x float> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; NO-EVL-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; NO-EVL:       middle.block:
+; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; NO-EVL-NEXT:    [[TMP9:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; NO-EVL-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP9]])
+; NO-EVL-NEXT:    [[TMP11:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[TMP10]], 0
+; NO-EVL-NEXT:    [[TMP13:%.*]] = and i1 [[TMP11]], [[TMP12]]
+; NO-EVL-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 0, i32 -1
+; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x float> [[CSA_DATA_SEL]], i32 [[TMP14]]
+; NO-EVL-NEXT:    [[TMP15:%.*]] = icmp sge i32 [[TMP14]], 0
+; NO-EVL-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], float [[CSA_EXTRACT]], float 1.000000e+00
+; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL:       scalar.ph:
+; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-EVL:       for.cond.cleanup.loopexit:
+; NO-EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; NO-EVL:       for.cond.cleanup:
+; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; NO-EVL-NEXT:    ret float [[T_0_LCSSA]]
+; NO-EVL:       for.body:
+; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[T_09:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP17:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP17]], 0.000000e+00
+; NO-EVL-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP17]], float [[T_09]]
+; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+;
+; DATA-LABEL: @simple_csa_float_select(
+; DATA-NEXT:  entry:
+; DATA-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT:    br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA:       for.body.preheader:
+; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
+; DATA-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DATA:       vector.ph:
+; DATA-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; DATA-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; DATA-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    br label [[VECTOR_BODY:%.*]]
+; DATA:       vector.body:
+; DATA-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; DATA-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[TMP3]]
+; DATA-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 0
+; DATA-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x float>, ptr [[TMP5]], align 4
+; DATA-NEXT:    [[TMP6:%.*]] = fcmp ogt <vscale x 1 x float> [[WIDE_LOAD]], zeroinitializer
+; DATA-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP6]])
+; DATA-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP7]], <vscale x 1 x i1> [[TMP6]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; DATA-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP7]], <vscale x 1 x float> [[WIDE_LOAD]], <vscale x 1 x float> [[CSA_DATA_PHI]]
+; DATA-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; DATA-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DATA-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; DATA:       middle.block:
+; DATA-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; DATA-NEXT:    [[TMP9:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; DATA-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP9]])
+; DATA-NEXT:    [[TMP11:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; DATA-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[TMP10]], 0
+; DATA-NEXT:    [[TMP13:%.*]] = and i1 [[TMP11]], [[TMP12]]
+; DATA-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 0, i32 -1
+; DATA-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x float> [[CSA_DATA_SEL]], i32 [[TMP14]]
+; DATA-NEXT:    [[TMP15:%.*]] = icmp sge i32 [[TMP14]], 0
+; DATA-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], float [[CSA_EXTRACT]], float 1.000000e+00
+; DATA-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; DATA-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DATA:       scalar.ph:
+; DATA-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; DATA-NEXT:    br label [[FOR_BODY:%.*]]
+; DATA:       for.cond.cleanup.loopexit:
+; DATA-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
+; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
+; DATA:       for.cond.cleanup:
+; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; DATA-NEXT:    ret float [[T_0_LCSSA]]
+; DATA:       for.body:
+; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[T_09:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP17:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; DATA-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP17]], 0.000000e+00
+; DATA-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP17]], float [[T_09]]
+; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+;
+entry:
+  %cmp8 = icmp sgt i32 %N, 0
+  br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.1, %for.body ]
+  ret float %t.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %t.09 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.body ]
+  %arrayidx = getelementptr inbounds float, ptr %data, i64 %indvars.iv
+  %0 = load float, ptr %arrayidx, align 4
+  %cmp1 = fcmp ogt float %0, 0.000000e+00
+  %t.1 = select i1 %cmp1, float %0, float %t.09
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int simple_csa_int(int N, bool *cond, int *data) {
+;   int t = -1;
+;   for (int i = 0; i < N; i++) {
+;     if (cond[i])
+;       t = data[i];
+;   }
+;   return t; // use t
+; }
+define i32 @simple_csa_int(i32 %N, ptr %cond, ptr %data) {
+; EVL-LABEL: @simple_csa_int(
+; EVL-NOT: vector.body:
+;
+; NO-EVL-LABEL: @simple_csa_int(
+; NO-EVL-NOT: vector.body:
+;
+; DATA-LABEL: @simple_csa_int(
+; DATA-NOT: vector.body:
+;
+entry:
+  %cmp6 = icmp sgt i32 %N, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.inc, %entry
+  %t.0.lcssa = phi i32 [ -1, %entry ], [ %t.1, %for.inc ]
+  ret i32 %t.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %t.07 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
+  %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %indvars.iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %tobool.not = icmp eq i8 %0, 0
+  br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %arrayidx2 = getelementptr inbounds i32, ptr %data, i64 %indvars.iv
+  %1 = load i32, ptr %arrayidx2, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %t.1 = phi i32 [ %1, %if.then ], [ %t.07, %for.body ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; float simple_csa_float(int N, bool *cond, float *data) {
+;   float t = 1.0f;
+;   for (int i = 0; i < N; i++) {
+;     if (cond[i])
+;       t = data[i];
+;   }
+;   return t; // use t
+; }
+define float @simple_csa_float(i32 %N, ptr %cond, ptr %data) {
+; EVL-LABEL: @simple_csa_float(
+; EVL-NOT: vector.body:
+;
+; NO-EVL-LABEL: @simple_csa_float(
+; NO-EVL-NOT: vector.body:
+;
+; DATA-LABEL: @simple_csa_float(
+; NO-EVL-NOT: vector.body:
+;
+entry:
+  %cmp6 = icmp sgt i32 %N, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.inc, %entry
+  %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.1, %for.inc ]
+  ret float %t.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %t.07 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
+  %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %indvars.iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %tobool.not = icmp eq i8 %0, 0
+  br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %arrayidx2 = getelementptr inbounds float, ptr %data, i64 %indvars.iv
+  %1 = load float, ptr %arrayidx2, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %t.1 = phi float [ %1, %if.then ], [ %t.07, %for.body ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int csa_in_series_int_select(int N, int *data0, int *data1, int a) {
+;   int t = -1;
+;   int s = -1;
+;   for (int i = 0; i < N; i++) {
+;     if (a < data0[i])
+;       t = data0[i];
+;     if (a < data1[i])
+;       s = data1[i];
+;   }
+;   return t | s; // use t and s
+; }
+define i32 @csa_in_series_int_select(i32 %N, ptr %data0, ptr %data1, i64 %a) {
+; EVL-LABEL: @csa_in_series_int_select(
+; EVL-NEXT:  entry:
+; EVL-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL:       for.body.preheader:
+; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
+; EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; EVL:       vector.ph:
+; EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[A:%.*]], i64 0
+; EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; EVL:       vector.body:
+; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; EVL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP3]]
+; EVL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP5]], align 4
+; EVL-NEXT:    [[TMP6:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
+; EVL-NEXT:    [[TMP7:%.*]] = icmp slt <vscale x 1 x i64> [[BROADCAST_SPLAT]], [[TMP6]]
+; EVL-NEXT:    [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP7]])
+; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP8]], <vscale x 1 x i1> [[TMP7]], <vscale x 1 x i1> [[CSA_MASK_PHI1]]
+; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP8]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI2]]
+; EVL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP3]]
+; EVL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; EVL-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 1 x i32>, ptr [[TMP10]], align 4
+; EVL-NEXT:    [[TMP11:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD3]] to <vscale x 1 x i64>
+; EVL-NEXT:    [[TMP12:%.*]] = icmp slt <vscale x 1 x i64> [[BROADCAST_SPLAT]], [[TMP11]]
+; EVL-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP12]])
+; EVL-NEXT:    [[CSA_MASK_SEL4]] = select i1 [[TMP13]], <vscale x 1 x i1> [[TMP12]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT:    [[CSA_DATA_SEL5]] = select i1 [[TMP13]], <vscale x 1 x i32> [[WIDE_LOAD3]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
+; EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; EVL-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; EVL:       middle.block:
+; EVL-NEXT:    [[CSA_STEP6:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; EVL-NEXT:    [[TMP15:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL4]], <vscale x 1 x i32> [[CSA_STEP6]], <vscale x 1 x i32> zeroinitializer
+; EVL-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP15]])
+; EVL-NEXT:    [[TMP17:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL4]], i64 0
+; EVL-NEXT:    [[TMP18:%.*]] = icmp eq i32 [[TMP16]], 0
+; EVL-NEXT:    [[TMP19:%.*]] = and i1 [[TMP17]], [[TMP18]]
+; EVL-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 0, i32 -1
+; EVL-NEXT:    [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL5]], i32 [[TMP20]]
+; EVL-NEXT:    [[TMP21:%.*]] = icmp sge i32 [[TMP20]], 0
+; EVL-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[CSA_EXTRACT7]], i32 -1
+; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; EVL-NEXT:    [[TMP23:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; EVL-NEXT:    [[TMP24:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP23]])
+; EVL-NEXT:    [[TMP25:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT:    [[TMP26:%.*]] = icmp eq i32 [[TMP24]], 0
+; EVL-NEXT:    [[TMP27:%.*]] = and i1 [[TMP25]], [[TMP26]]
+; EVL-NEXT:    [[TMP28:%.*]] = select i1 [[TMP27]], i32 0, i32 -1
+; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP28]]
+; EVL-NEXT:    [[TMP29:%.*]] = icmp sge i32 [[TMP28]], 0
+; EVL-NEXT:    [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[CSA_EXTRACT]], i32 -1
+; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL:       scalar.ph:
+; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; EVL:       for.cond.cleanup.loopexit:
+; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP30]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT:    [[TMP31:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
+; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; EVL:       for.cond.cleanup:
+; EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP31]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
+; EVL-NEXT:    ret i32 [[OR]]
+; EVL:       for.body:
+; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP32:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT:    [[TMP33:%.*]] = sext i32 [[TMP32]] to i64
+; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP33]]
+; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP32]], i32 [[T_022]]
+; EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP34:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; EVL-NEXT:    [[TMP35:%.*]] = sext i32 [[TMP34]] to i64
+; EVL-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[A]], [[TMP35]]
+; EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP34]], i32 [[S_023]]
+; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+;
+; NO-EVL-LABEL: @csa_in_series_int_select(
+; NO-EVL-NEXT:  entry:
+; NO-EVL-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL:       for.body.preheader:
+; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
+; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-EVL:       vector.ph:
+; NO-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[A:%.*]], i64 0
+; NO-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NO-EVL:       vector.body:
+; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP3]]
+; NO-EVL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP5]], align 4
+; NO-EVL-NEXT:    [[TMP6:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
+; NO-EVL-NEXT:    [[TMP7:%.*]] = icmp slt <vscale x 1 x i64> [[BROADCAST_SPLAT]], [[TMP6]]
+; NO-EVL-NEXT:    [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP7]])
+; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP8]], <vscale x 1 x i1> [[TMP7]], <vscale x 1 x i1> [[CSA_MASK_PHI1]]
+; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP8]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI2]]
+; NO-EVL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP3]]
+; NO-EVL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; NO-EVL-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 1 x i32>, ptr [[TMP10]], align 4
+; NO-EVL-NEXT:    [[TMP11:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD3]] to <vscale x 1 x i64>
+; NO-EVL-NEXT:    [[TMP12:%.*]] = icmp slt <vscale x 1 x i64> [[BROADCAST_SPLAT]], [[TMP11]]
+; NO-EVL-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP12]])
+; NO-EVL-NEXT:    [[CSA_MASK_SEL4]] = select i1 [[TMP13]], <vscale x 1 x i1> [[TMP12]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT:    [[CSA_DATA_SEL5]] = select i1 [[TMP13]], <vscale x 1 x i32> [[WIDE_LOAD3]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; NO-EVL-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; NO-EVL:       middle.block:
+; NO-EVL-NEXT:    [[CSA_STEP6:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; NO-EVL-NEXT:    [[TMP15:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL4]], <vscale x 1 x i32> [[CSA_STEP6]], <vscale x 1 x i32> zeroinitializer
+; NO-EVL-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP15]])
+; NO-EVL-NEXT:    [[TMP17:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL4]], i64 0
+; NO-EVL-NEXT:    [[TMP18:%.*]] = icmp eq i32 [[TMP16]], 0
+; NO-EVL-NEXT:    [[TMP19:%.*]] = and i1 [[TMP17]], [[TMP18]]
+; NO-EVL-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 0, i32 -1
+; NO-EVL-NEXT:    [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL5]], i32 [[TMP20]]
+; NO-EVL-NEXT:    [[TMP21:%.*]] = icmp sge i32 [[TMP20]], 0
+; NO-EVL-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[CSA_EXTRACT7]], i32 -1
+; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; NO-EVL-NEXT:    [[TMP23:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; NO-EVL-NEXT:    [[TMP24:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP23]])
+; NO-EVL-NEXT:    [[TMP25:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT:    [[TMP26:%.*]] = icmp eq i32 [[TMP24]], 0
+; NO-EVL-NEXT:    [[TMP27:%.*]] = and i1 [[TMP25]], [[TMP26]]
+; NO-EVL-NEXT:    [[TMP28:%.*]] = select i1 [[TMP27]], i32 0, i32 -1
+; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP28]]
+; NO-EVL-NEXT:    [[TMP29:%.*]] = icmp sge i32 [[TMP28]], 0
+; NO-EVL-NEXT:    [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[CSA_EXTRACT]], i32 -1
+; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL:       scalar.ph:
+; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-EVL:       for.cond.cleanup.loopexit:
+; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP30]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT:    [[TMP31:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
+; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; NO-EVL:       for.cond.cleanup:
+; NO-EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP31]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
+; NO-EVL-NEXT:    ret i32 [[OR]]
+; NO-EVL:       for.body:
+; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP32:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT:    [[TMP33:%.*]] = sext i32 [[TMP32]] to i64
+; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP33]]
+; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP32]], i32 [[T_022]]
+; NO-EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP34:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; NO-EVL-NEXT:    [[TMP35:%.*]] = sext i32 [[TMP34]] to i64
+; NO-EVL-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[A]], [[TMP35]]
+; NO-EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP34]], i32 [[S_023]]
+; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+;
+; DATA-LABEL: @csa_in_series_int_select(
+; DATA-NEXT:  entry:
+; DATA-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA:       for.body.preheader:
+; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
+; DATA-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DATA:       vector.ph:
+; DATA-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; DATA-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; DATA-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[A:%.*]], i64 0
+; DATA-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; DATA-NEXT:    br label [[VECTOR_BODY:%.*]]
+; DATA:       vector.body:
+; DATA-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; DATA-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP3]]
+; DATA-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; DATA-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP5]], align 4
+; DATA-NEXT:    [[TMP6:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
+; DATA-NEXT:    [[TMP7:%.*]] = icmp slt <vscale x 1 x i64> [[BROADCAST_SPLAT]], [[TMP6]]
+; DATA-NEXT:    [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP7]])
+; DATA-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP8]], <vscale x 1 x i1> [[TMP7]], <vscale x 1 x i1> [[CSA_MASK_PHI1]]
+; DATA-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP8]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI2]]
+; DATA-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP3]]
+; DATA-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; DATA-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 1 x i32>, ptr [[TMP10]], align 4
+; DATA-NEXT:    [[TMP11:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD3]] to <vscale x 1 x i64>
+; DATA-NEXT:    [[TMP12:%.*]] = icmp slt <vscale x 1 x i64> [[BROADCAST_SPLAT]], [[TMP11]]
+; DATA-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP12]])
+; DATA-NEXT:    [[CSA_MASK_SEL4]] = select i1 [[TMP13]], <vscale x 1 x i1> [[TMP12]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; DATA-NEXT:    [[CSA_DATA_SEL5]] = select i1 [[TMP13]], <vscale x 1 x i32> [[WIDE_LOAD3]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
+; DATA-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; DATA-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DATA-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; DATA:       middle.block:
+; DATA-NEXT:    [[CSA_STEP6:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; DATA-NEXT:    [[TMP15:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL4]], <vscale x 1 x i32> [[CSA_STEP6]], <vscale x 1 x i32> zeroinitializer
+; DATA-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP15]])
+; DATA-NEXT:    [[TMP17:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL4]], i64 0
+; DATA-NEXT:    [[TMP18:%.*]] = icmp eq i32 [[TMP16]], 0
+; DATA-NEXT:    [[TMP19:%.*]] = and i1 [[TMP17]], [[TMP18]]
+; DATA-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 0, i32 -1
+; DATA-NEXT:    [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL5]], i32 [[TMP20]]
+; DATA-NEXT:    [[TMP21:%.*]] = icmp sge i32 [[TMP20]], 0
+; DATA-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[CSA_EXTRACT7]], i32 -1
+; DATA-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; DATA-NEXT:    [[TMP23:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; DATA-NEXT:    [[TMP24:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP23]])
+; DATA-NEXT:    [[TMP25:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; DATA-NEXT:    [[TMP26:%.*]] = icmp eq i32 [[TMP24]], 0
+; DATA-NEXT:    [[TMP27:%.*]] = and i1 [[TMP25]], [[TMP26]]
+; DATA-NEXT:    [[TMP28:%.*]] = select i1 [[TMP27]], i32 0, i32 -1
+; DATA-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP28]]
+; DATA-NEXT:    [[TMP29:%.*]] = icmp sge i32 [[TMP28]], 0
+; DATA-NEXT:    [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[CSA_EXTRACT]], i32 -1
+; DATA-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; DATA-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DATA:       scalar.ph:
+; DATA-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; DATA-NEXT:    br label [[FOR_BODY:%.*]]
+; DATA:       for.cond.cleanup.loopexit:
+; DATA-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP30]], [[MIDDLE_BLOCK]] ]
+; DATA-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
+; DATA-NEXT:    [[TMP31:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
+; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
+; DATA:       for.cond.cleanup:
+; DATA-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP31]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
+; DATA-NEXT:    ret i32 [[OR]]
+; DATA:       for.body:
+; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP32:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; DATA-NEXT:    [[TMP33:%.*]] = sext i32 [[TMP32]] to i64
+; DATA-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP33]]
+; DATA-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP32]], i32 [[T_022]]
+; DATA-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP34:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; DATA-NEXT:    [[TMP35:%.*]] = sext i32 [[TMP34]] to i64
+; DATA-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[A]], [[TMP35]]
+; DATA-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP34]], i32 [[S_023]]
+; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+;
+entry:
+  %cmp21 = icmp sgt i32 %N, 0
+  br i1 %cmp21, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %0 = or i32 %s.1, %spec.select
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %or = phi i32 [ %0, %for.cond.cleanup.loopexit ], [ -1, %entry ]
+  ret i32 %or
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %s.023 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.body ]
+  %t.022 = phi i32 [ -1, %for.body.preheader ], [ %spec.select, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
+  %1 = load i32, ptr %arrayidx, align 4
+  %2 = sext i32 %1 to i64
+  %cmp1 = icmp slt i64 %a, %2
+  %spec.select = select i1 %cmp1, i32 %1, i32 %t.022
+  %arrayidx5 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
+  %3 = load i32, ptr %arrayidx5, align 4
+  %4 = sext i32 %3 to i64
+  %cmp6 = icmp slt i64 %a, %4
+  %s.1 = select i1 %cmp6, i32 %3, i32 %s.023
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int csa_in_series_int_select_induction_cmp(int N, int *data0, int *data1) {
+;   int t = -1;
+;   int s = -1;
+;   for (int i = 0; i < N; i++) {
+;     if (i < data0[i])
+;       t = data0[i];
+;     if (i < data1[i])
+;       s = data1[i];
+;   }
+;   return t | s; // use t and s
+; }
+define i32 @csa_in_series_int_select_induction_cmp(i32 %N, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_in_series_int_select_induction_cmp(
+; EVL-NEXT:  entry:
+; EVL-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL:       for.body.preheader:
+; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
+; EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; EVL:       vector.ph:
+; EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
+; EVL-NEXT:    [[TMP4:%.*]] = add <vscale x 1 x i64> [[TMP3]], zeroinitializer
+; EVL-NEXT:    [[TMP5:%.*]] = mul <vscale x 1 x i64> [[TMP4]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
+; EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP5]]
+; EVL-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP7:%.*]] = mul i64 1, [[TMP6]]
+; EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP7]], i64 0
+; EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; EVL:       vector.body:
+; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 0
+; EVL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP8]]
+; EVL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP10]], align 4
+; EVL-NEXT:    [[TMP11:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
+; EVL-NEXT:    [[TMP12:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP11]]
+; EVL-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP12]])
+; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP13]], <vscale x 1 x i1> [[TMP12]], <vscale x 1 x i1> [[CSA_MASK_PHI1]]
+; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP13]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI2]]
+; EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP8]]
+; EVL-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
+; EVL-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 1 x i32>, ptr [[TMP15]], align 4
+; EVL-NEXT:    [[TMP16:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD3]] to <vscale x 1 x i64>
+; EVL-NEXT:    [[TMP17:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP16]]
+; EVL-NEXT:    [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP17]])
+; EVL-NEXT:    [[CSA_MASK_SEL4]] = select i1 [[TMP18]], <vscale x 1 x i1> [[TMP17]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT:    [[CSA_DATA_SEL5]] = select i1 [[TMP18]], <vscale x 1 x i32> [[WIDE_LOAD3]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
+; EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; EVL-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; EVL:       middle.block:
+; EVL-NEXT:    [[CSA_STEP6:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; EVL-NEXT:    [[TMP20:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL4]], <vscale x 1 x i32> [[CSA_STEP6]], <vscale x 1 x i32> zeroinitializer
+; EVL-NEXT:    [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP20]])
+; EVL-NEXT:    [[TMP22:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL4]], i64 0
+; EVL-NEXT:    [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
+; EVL-NEXT:    [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
+; EVL-NEXT:    [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
+; EVL-NEXT:    [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL5]], i32 [[TMP25]]
+; EVL-NEXT:    [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
+; EVL-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[CSA_EXTRACT7]], i32 -1
+; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; EVL-NEXT:    [[TMP28:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; EVL-NEXT:    [[TMP29:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP28]])
+; EVL-NEXT:    [[TMP30:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT:    [[TMP31:%.*]] = icmp eq i32 [[TMP29]], 0
+; EVL-NEXT:    [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]]
+; EVL-NEXT:    [[TMP33:%.*]] = select i1 [[TMP32]], i32 0, i32 -1
+; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP33]]
+; EVL-NEXT:    [[TMP34:%.*]] = icmp sge i32 [[TMP33]], 0
+; EVL-NEXT:    [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[CSA_EXTRACT]], i32 -1
+; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL:       scalar.ph:
+; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; EVL:       for.cond.cleanup.loopexit:
+; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP35]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT:    [[TMP36:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
+; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; EVL:       for.cond.cleanup:
+; EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP36]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
+; EVL-NEXT:    ret i32 [[OR]]
+; EVL:       for.body:
+; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP37:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT:    [[TMP38:%.*]] = sext i32 [[TMP37]] to i64
+; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP38]]
+; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP37]], i32 [[T_022]]
+; EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP39:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; EVL-NEXT:    [[TMP40:%.*]] = sext i32 [[TMP39]] to i64
+; EVL-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP40]]
+; EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP39]], i32 [[S_023]]
+; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+;
+; NO-EVL-LABEL: @csa_in_series_int_select_induction_cmp(
+; NO-EVL-NEXT:  entry:
+; NO-EVL-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL:       for.body.preheader:
+; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
+; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-EVL:       vector.ph:
+; NO-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
+; NO-EVL-NEXT:    [[TMP4:%.*]] = add <vscale x 1 x i64> [[TMP3]], zeroinitializer
+; NO-EVL-NEXT:    [[TMP5:%.*]] = mul <vscale x 1 x i64> [[TMP4]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
+; NO-EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP5]]
+; NO-EVL-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP7:%.*]] = mul i64 1, [[TMP6]]
+; NO-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP7]], i64 0
+; NO-EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NO-EVL:       vector.body:
+; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP8]]
+; NO-EVL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP10]], align 4
+; NO-EVL-NEXT:    [[TMP11:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
+; NO-EVL-NEXT:    [[TMP12:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP11]]
+; NO-EVL-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP12]])
+; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP13]], <vscale x 1 x i1> [[TMP12]], <vscale x 1 x i1> [[CSA_MASK_PHI1]]
+; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP13]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI2]]
+; NO-EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP8]]
+; NO-EVL-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
+; NO-EVL-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 1 x i32>, ptr [[TMP15]], align 4
+; NO-EVL-NEXT:    [[TMP16:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD3]] to <vscale x 1 x i64>
+; NO-EVL-NEXT:    [[TMP17:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP16]]
+; NO-EVL-NEXT:    [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP17]])
+; NO-EVL-NEXT:    [[CSA_MASK_SEL4]] = select i1 [[TMP18]], <vscale x 1 x i1> [[TMP17]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT:    [[CSA_DATA_SEL5]] = select i1 [[TMP18]], <vscale x 1 x i32> [[WIDE_LOAD3]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; NO-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; NO-EVL-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; NO-EVL:       middle.block:
+; NO-EVL-NEXT:    [[CSA_STEP6:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; NO-EVL-NEXT:    [[TMP20:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL4]], <vscale x 1 x i32> [[CSA_STEP6]], <vscale x 1 x i32> zeroinitializer
+; NO-EVL-NEXT:    [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP20]])
+; NO-EVL-NEXT:    [[TMP22:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL4]], i64 0
+; NO-EVL-NEXT:    [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
+; NO-EVL-NEXT:    [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
+; NO-EVL-NEXT:    [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
+; NO-EVL-NEXT:    [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL5]], i32 [[TMP25]]
+; NO-EVL-NEXT:    [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
+; NO-EVL-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[CSA_EXTRACT7]], i32 -1
+; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; NO-EVL-NEXT:    [[TMP28:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; NO-EVL-NEXT:    [[TMP29:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP28]])
+; NO-EVL-NEXT:    [[TMP30:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT:    [[TMP31:%.*]] = icmp eq i32 [[TMP29]], 0
+; NO-EVL-NEXT:    [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]]
+; NO-EVL-NEXT:    [[TMP33:%.*]] = select i1 [[TMP32]], i32 0, i32 -1
+; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP33]]
+; NO-EVL-NEXT:    [[TMP34:%.*]] = icmp sge i32 [[TMP33]], 0
+; NO-EVL-NEXT:    [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[CSA_EXTRACT]], i32 -1
+; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL:       scalar.ph:
+; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-EVL:       for.cond.cleanup.loopexit:
+; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP35]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT:    [[TMP36:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
+; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; NO-EVL:       for.cond.cleanup:
+; NO-EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP36]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
+; NO-EVL-NEXT:    ret i32 [[OR]]
+; NO-EVL:       for.body:
+; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP37:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT:    [[TMP38:%.*]] = sext i32 [[TMP37]] to i64
+; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP38]]
+; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP37]], i32 [[T_022]]
+; NO-EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP39:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; NO-EVL-NEXT:    [[TMP40:%.*]] = sext i32 [[TMP39]] to i64
+; NO-EVL-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP40]]
+; NO-EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP39]], i32 [[S_023]]
+; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+;
+; DATA-LABEL: @csa_in_series_int_select_induction_cmp(
+; DATA-NEXT:  entry:
+; DATA-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA:       for.body.preheader:
+; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
+; DATA-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DATA:       vector.ph:
+; DATA-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; DATA-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; DATA-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
+; DATA-NEXT:    [[TMP4:%.*]] = add <vscale x 1 x i64> [[TMP3]], zeroinitializer
+; DATA-NEXT:    [[TMP5:%.*]] = mul <vscale x 1 x i64> [[TMP4]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
+; DATA-NEXT:    [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP5]]
+; DATA-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[TMP7:%.*]] = mul i64 1, [[TMP6]]
+; DATA-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP7]], i64 0
+; DATA-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; DATA-NEXT:    br label [[VECTOR_BODY:%.*]]
+; DATA:       vector.body:
+; DATA-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 0
+; DATA-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP8]]
+; DATA-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; DATA-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP10]], align 4
+; DATA-NEXT:    [[TMP11:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
+; DATA-NEXT:    [[TMP12:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP11]]
+; DATA-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP12]])
+; DATA-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP13]], <vscale x 1 x i1> [[TMP12]], <vscale x 1 x i1> [[CSA_MASK_PHI1]]
+; DATA-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP13]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI2]]
+; DATA-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP8]]
+; DATA-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
+; DATA-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 1 x i32>, ptr [[TMP15]], align 4
+; DATA-NEXT:    [[TMP16:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD3]] to <vscale x 1 x i64>
+; DATA-NEXT:    [[TMP17:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP16]]
+; DATA-NEXT:    [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP17]])
+; DATA-NEXT:    [[CSA_MASK_SEL4]] = select i1 [[TMP18]], <vscale x 1 x i1> [[TMP17]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; DATA-NEXT:    [[CSA_DATA_SEL5]] = select i1 [[TMP18]], <vscale x 1 x i32> [[WIDE_LOAD3]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
+; DATA-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; DATA-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; DATA-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DATA-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; DATA:       middle.block:
+; DATA-NEXT:    [[CSA_STEP6:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; DATA-NEXT:    [[TMP20:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL4]], <vscale x 1 x i32> [[CSA_STEP6]], <vscale x 1 x i32> zeroinitializer
+; DATA-NEXT:    [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP20]])
+; DATA-NEXT:    [[TMP22:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL4]], i64 0
+; DATA-NEXT:    [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
+; DATA-NEXT:    [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
+; DATA-NEXT:    [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
+; DATA-NEXT:    [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL5]], i32 [[TMP25]]
+; DATA-NEXT:    [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
+; DATA-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[CSA_EXTRACT7]], i32 -1
+; DATA-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; DATA-NEXT:    [[TMP28:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; DATA-NEXT:    [[TMP29:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP28]])
+; DATA-NEXT:    [[TMP30:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; DATA-NEXT:    [[TMP31:%.*]] = icmp eq i32 [[TMP29]], 0
+; DATA-NEXT:    [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]]
+; DATA-NEXT:    [[TMP33:%.*]] = select i1 [[TMP32]], i32 0, i32 -1
+; DATA-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP33]]
+; DATA-NEXT:    [[TMP34:%.*]] = icmp sge i32 [[TMP33]], 0
+; DATA-NEXT:    [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[CSA_EXTRACT]], i32 -1
+; DATA-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; DATA-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DATA:       scalar.ph:
+; DATA-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; DATA-NEXT:    br label [[FOR_BODY:%.*]]
+; DATA:       for.cond.cleanup.loopexit:
+; DATA-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP35]], [[MIDDLE_BLOCK]] ]
+; DATA-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
+; DATA-NEXT:    [[TMP36:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
+; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
+; DATA:       for.cond.cleanup:
+; DATA-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP36]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
+; DATA-NEXT:    ret i32 [[OR]]
+; DATA:       for.body:
+; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP37:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; DATA-NEXT:    [[TMP38:%.*]] = sext i32 [[TMP37]] to i64
+; DATA-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP38]]
+; DATA-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP37]], i32 [[T_022]]
+; DATA-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP39:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; DATA-NEXT:    [[TMP40:%.*]] = sext i32 [[TMP39]] to i64
+; DATA-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP40]]
+; DATA-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP39]], i32 [[S_023]]
+; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+;
+entry:
+  %cmp21 = icmp sgt i32 %N, 0
+  br i1 %cmp21, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %0 = or i32 %s.1, %spec.select
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %or = phi i32 [ %0, %for.cond.cleanup.loopexit ], [ -1, %entry ]
+  ret i32 %or
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %s.023 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.body ]
+  %t.022 = phi i32 [ -1, %for.body.preheader ], [ %spec.select, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
+  %1 = load i32, ptr %arrayidx, align 4
+  %2 = sext i32 %1 to i64
+  %cmp1 = icmp slt i64 %indvars.iv, %2
+  %spec.select = select i1 %cmp1, i32 %1, i32 %t.022
+  %arrayidx5 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
+  %3 = load i32, ptr %arrayidx5, align 4
+  %4 = sext i32 %3 to i64
+  %cmp6 = icmp slt i64 %indvars.iv, %4
+  %s.1 = select i1 %cmp6, i32 %3, i32 %s.023
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; float csa_in_series_float_select(int N, float *data0,
+;                           float *data1) {
+;   float t = 1.0f;
+;   float s = 1.0f;
+;   for (int i = 0; i < N; i++) {
+;     if (0.0f < data0[i])
+;       t = data0[i];
+;     if (0.0f <data1[i])
+;       s = data1[i];
+;   }
+;   return t + s; // use t and s
+; }
+define float @csa_in_series_float_select(i32 %N, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_in_series_float_select(
+; EVL-NEXT:  entry:
+; EVL-NEXT:    [[CMP19:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT:    br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL:       for.body.preheader:
+; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
+; EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; EVL:       vector.ph:
+; EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; EVL:       vector.body:
+; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 1 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; EVL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[TMP3]]
+; EVL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 0
+; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x float>, ptr [[TMP5]], align 4
+; EVL-NEXT:    [[TMP6:%.*]] = fcmp ogt <vscale x 1 x float> [[WIDE_LOAD]], zeroinitializer
+; EVL-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP6]])
+; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP7]], <vscale x 1 x i1> [[TMP6]], <vscale x 1 x i1> [[CSA_MASK_PHI1]]
+; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP7]], <vscale x 1 x float> [[WIDE_LOAD]], <vscale x 1 x float> [[CSA_DATA_PHI2]]
+; EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[TMP3]]
+; EVL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0
+; EVL-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 1 x float>, ptr [[TMP9]], align 4
+; EVL-NEXT:    [[TMP10:%.*]] = fcmp ogt <vscale x 1 x float> [[WIDE_LOAD3]], zeroinitializer
+; EVL-NEXT:    [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP10]])
+; EVL-NEXT:    [[CSA_MASK_SEL4]] = select i1 [[TMP11]], <vscale x 1 x i1> [[TMP10]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT:    [[CSA_DATA_SEL5]] = select i1 [[TMP11]], <vscale x 1 x float> [[WIDE_LOAD3]], <vscale x 1 x float> [[CSA_DATA_PHI]]
+; EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; EVL-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; EVL:       middle.block:
+; EVL-NEXT:    [[CSA_STEP6:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; EVL-NEXT:    [[TMP13:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL4]], <vscale x 1 x i32> [[CSA_STEP6]], <vscale x 1 x i32> zeroinitializer
+; EVL-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP13]])
+; EVL-NEXT:    [[TMP15:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL4]], i64 0
+; EVL-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[TMP14]], 0
+; EVL-NEXT:    [[TMP17:%.*]] = and i1 [[TMP15]], [[TMP16]]
+; EVL-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i32 0, i32 -1
+; EVL-NEXT:    [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 1 x float> [[CSA_DATA_SEL5]], i32 [[TMP18]]
+; EVL-NEXT:    [[TMP19:%.*]] = icmp sge i32 [[TMP18]], 0
+; EVL-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], float [[CSA_EXTRACT7]], float 1.000000e+00
+; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; EVL-NEXT:    [[TMP21:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; EVL-NEXT:    [[TMP22:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP21]])
+; EVL-NEXT:    [[TMP23:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT:    [[TMP24:%.*]] = icmp eq i32 [[TMP22]], 0
+; EVL-NEXT:    [[TMP25:%.*]] = and i1 [[TMP23]], [[TMP24]]
+; EVL-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i32 0, i32 -1
+; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x float> [[CSA_DATA_SEL]], i32 [[TMP26]]
+; EVL-NEXT:    [[TMP27:%.*]] = icmp sge i32 [[TMP26]], 0
+; EVL-NEXT:    [[TMP28:%.*]] = select i1 [[TMP27]], float [[CSA_EXTRACT]], float 1.000000e+00
+; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL:       scalar.ph:
+; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; EVL:       for.cond.cleanup.loopexit:
+; EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT:    [[TMP29:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
+; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; EVL:       for.cond.cleanup:
+; EVL-NEXT:    [[ADD:%.*]] = phi float [ [[TMP29]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
+; EVL-NEXT:    ret float [[ADD]]
+; EVL:       for.body:
+; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[S_021:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[T_020:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP30:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP30]], 0.000000e+00
+; EVL-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP30]], float [[T_020]]
+; EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP31:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
+; EVL-NEXT:    [[CMP6:%.*]] = fcmp ogt float [[TMP31]], 0.000000e+00
+; EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], float [[TMP31]], float [[S_021]]
+; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+;
+; NO-EVL-LABEL: @csa_in_series_float_select(
+; NO-EVL-NEXT:  entry:
+; NO-EVL-NEXT:    [[CMP19:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT:    br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL:       for.body.preheader:
+; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
+; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-EVL:       vector.ph:
+; NO-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NO-EVL:       vector.body:
+; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 1 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[TMP3]]
+; NO-EVL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 0
+; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x float>, ptr [[TMP5]], align 4
+; NO-EVL-NEXT:    [[TMP6:%.*]] = fcmp ogt <vscale x 1 x float> [[WIDE_LOAD]], zeroinitializer
+; NO-EVL-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP6]])
+; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP7]], <vscale x 1 x i1> [[TMP6]], <vscale x 1 x i1> [[CSA_MASK_PHI1]]
+; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP7]], <vscale x 1 x float> [[WIDE_LOAD]], <vscale x 1 x float> [[CSA_DATA_PHI2]]
+; NO-EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[TMP3]]
+; NO-EVL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0
+; NO-EVL-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 1 x float>, ptr [[TMP9]], align 4
+; NO-EVL-NEXT:    [[TMP10:%.*]] = fcmp ogt <vscale x 1 x float> [[WIDE_LOAD3]], zeroinitializer
+; NO-EVL-NEXT:    [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP10]])
+; NO-EVL-NEXT:    [[CSA_MASK_SEL4]] = select i1 [[TMP11]], <vscale x 1 x i1> [[TMP10]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT:    [[CSA_DATA_SEL5]] = select i1 [[TMP11]], <vscale x 1 x float> [[WIDE_LOAD3]], <vscale x 1 x float> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; NO-EVL-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; NO-EVL:       middle.block:
+; NO-EVL-NEXT:    [[CSA_STEP6:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; NO-EVL-NEXT:    [[TMP13:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL4]], <vscale x 1 x i32> [[CSA_STEP6]], <vscale x 1 x i32> zeroinitializer
+; NO-EVL-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP13]])
+; NO-EVL-NEXT:    [[TMP15:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL4]], i64 0
+; NO-EVL-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[TMP14]], 0
+; NO-EVL-NEXT:    [[TMP17:%.*]] = and i1 [[TMP15]], [[TMP16]]
+; NO-EVL-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i32 0, i32 -1
+; NO-EVL-NEXT:    [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 1 x float> [[CSA_DATA_SEL5]], i32 [[TMP18]]
+; NO-EVL-NEXT:    [[TMP19:%.*]] = icmp sge i32 [[TMP18]], 0
+; NO-EVL-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], float [[CSA_EXTRACT7]], float 1.000000e+00
+; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; NO-EVL-NEXT:    [[TMP21:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; NO-EVL-NEXT:    [[TMP22:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP21]])
+; NO-EVL-NEXT:    [[TMP23:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT:    [[TMP24:%.*]] = icmp eq i32 [[TMP22]], 0
+; NO-EVL-NEXT:    [[TMP25:%.*]] = and i1 [[TMP23]], [[TMP24]]
+; NO-EVL-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i32 0, i32 -1
+; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x float> [[CSA_DATA_SEL]], i32 [[TMP26]]
+; NO-EVL-NEXT:    [[TMP27:%.*]] = icmp sge i32 [[TMP26]], 0
+; NO-EVL-NEXT:    [[TMP28:%.*]] = select i1 [[TMP27]], float [[CSA_EXTRACT]], float 1.000000e+00
+; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL:       scalar.ph:
+; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-EVL:       for.cond.cleanup.loopexit:
+; NO-EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT:    [[TMP29:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
+; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; NO-EVL:       for.cond.cleanup:
+; NO-EVL-NEXT:    [[ADD:%.*]] = phi float [ [[TMP29]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
+; NO-EVL-NEXT:    ret float [[ADD]]
+; NO-EVL:       for.body:
+; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[S_021:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[T_020:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP30:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP30]], 0.000000e+00
+; NO-EVL-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP30]], float [[T_020]]
+; NO-EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP31:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
+; NO-EVL-NEXT:    [[CMP6:%.*]] = fcmp ogt float [[TMP31]], 0.000000e+00
+; NO-EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], float [[TMP31]], float [[S_021]]
+; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+;
+; DATA-LABEL: @csa_in_series_float_select(
+; DATA-NEXT:  entry:
+; DATA-NEXT:    [[CMP19:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT:    br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA:       for.body.preheader:
+; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
+; DATA-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DATA:       vector.ph:
+; DATA-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; DATA-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; DATA-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    br label [[VECTOR_BODY:%.*]]
+; DATA:       vector.body:
+; DATA-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 1 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; DATA-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[TMP3]]
+; DATA-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 0
+; DATA-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x float>, ptr [[TMP5]], align 4
+; DATA-NEXT:    [[TMP6:%.*]] = fcmp ogt <vscale x 1 x float> [[WIDE_LOAD]], zeroinitializer
+; DATA-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP6]])
+; DATA-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP7]], <vscale x 1 x i1> [[TMP6]], <vscale x 1 x i1> [[CSA_MASK_PHI1]]
+; DATA-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP7]], <vscale x 1 x float> [[WIDE_LOAD]], <vscale x 1 x float> [[CSA_DATA_PHI2]]
+; DATA-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[TMP3]]
+; DATA-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0
+; DATA-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 1 x float>, ptr [[TMP9]], align 4
+; DATA-NEXT:    [[TMP10:%.*]] = fcmp ogt <vscale x 1 x float> [[WIDE_LOAD3]], zeroinitializer
+; DATA-NEXT:    [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP10]])
+; DATA-NEXT:    [[CSA_MASK_SEL4]] = select i1 [[TMP11]], <vscale x 1 x i1> [[TMP10]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; DATA-NEXT:    [[CSA_DATA_SEL5]] = select i1 [[TMP11]], <vscale x 1 x float> [[WIDE_LOAD3]], <vscale x 1 x float> [[CSA_DATA_PHI]]
+; DATA-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; DATA-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DATA-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; DATA:       middle.block:
+; DATA-NEXT:    [[CSA_STEP6:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; DATA-NEXT:    [[TMP13:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL4]], <vscale x 1 x i32> [[CSA_STEP6]], <vscale x 1 x i32> zeroinitializer
+; DATA-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP13]])
+; DATA-NEXT:    [[TMP15:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL4]], i64 0
+; DATA-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[TMP14]], 0
+; DATA-NEXT:    [[TMP17:%.*]] = and i1 [[TMP15]], [[TMP16]]
+; DATA-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i32 0, i32 -1
+; DATA-NEXT:    [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 1 x float> [[CSA_DATA_SEL5]], i32 [[TMP18]]
+; DATA-NEXT:    [[TMP19:%.*]] = icmp sge i32 [[TMP18]], 0
+; DATA-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], float [[CSA_EXTRACT7]], float 1.000000e+00
+; DATA-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; DATA-NEXT:    [[TMP21:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; DATA-NEXT:    [[TMP22:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP21]])
+; DATA-NEXT:    [[TMP23:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; DATA-NEXT:    [[TMP24:%.*]] = icmp eq i32 [[TMP22]], 0
+; DATA-NEXT:    [[TMP25:%.*]] = and i1 [[TMP23]], [[TMP24]]
+; DATA-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i32 0, i32 -1
+; DATA-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x float> [[CSA_DATA_SEL]], i32 [[TMP26]]
+; DATA-NEXT:    [[TMP27:%.*]] = icmp sge i32 [[TMP26]], 0
+; DATA-NEXT:    [[TMP28:%.*]] = select i1 [[TMP27]], float [[CSA_EXTRACT]], float 1.000000e+00
+; DATA-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; DATA-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DATA:       scalar.ph:
+; DATA-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; DATA-NEXT:    br label [[FOR_BODY:%.*]]
+; DATA:       for.cond.cleanup.loopexit:
+; DATA-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ]
+; DATA-NEXT:    [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
+; DATA-NEXT:    [[TMP29:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
+; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
+; DATA:       for.cond.cleanup:
+; DATA-NEXT:    [[ADD:%.*]] = phi float [ [[TMP29]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
+; DATA-NEXT:    ret float [[ADD]]
+; DATA:       for.body:
+; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[S_021:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[T_020:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP30:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; DATA-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP30]], 0.000000e+00
+; DATA-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP30]], float [[T_020]]
+; DATA-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP31:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
+; DATA-NEXT:    [[CMP6:%.*]] = fcmp ogt float [[TMP31]], 0.000000e+00
+; DATA-NEXT:    [[S_1]] = select i1 [[CMP6]], float [[TMP31]], float [[S_021]]
+; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+;
+entry:
+  %cmp19 = icmp sgt i32 %N, 0
+  br i1 %cmp19, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %0 = fadd float %t.1, %s.1
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %add = phi float [ %0, %for.cond.cleanup.loopexit ], [ 2.000000e+00, %entry ]
+  ret float %add
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %s.021 = phi float [ 1.000000e+00, %for.body.preheader ], [ %s.1, %for.body ]
+  %t.020 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.body ]
+  %arrayidx = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
+  %1 = load float, ptr %arrayidx, align 4
+  %cmp1 = fcmp ogt float %1, 0.000000e+00
+  %t.1 = select i1 %cmp1, float %1, float %t.020
+  %arrayidx5 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
+  %2 = load float, ptr %arrayidx5, align 4
+  %cmp6 = fcmp ogt float %2, 0.000000e+00
+  %s.1 = select i1 %cmp6, float %2, float %s.021
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int csa_in_series_int(int N, bool *cond0, bool *cond1, int *data0, int *data1) {
+;   int t = -1;
+;   int s = -1;
+;   for (int i = 0; i < N; i++) {
+;     if (cond0[i])
+;       t = data0[i];
+;     if (cond1[i])
+;       s = data1[i];
+;   }
+;   return t | s; // use t and s
+; }
+define i32 @csa_in_series_int(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_in_series_int(
+; EVL-NOT: vector.body:
+;
+; NO-EVL-LABEL: @csa_in_series_int(
+; NO-EVL-NOT: vector.body:
+;
+; DATA-LABEL: @csa_in_series_int(
+; DATA-NOT: vector.body:
+;
+entry:
+  %cmp15 = icmp sgt i32 %N, 0
+  br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.inc
+  %0 = or i32 %s.1, %t.1
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %or = phi i32 [ %0, %for.cond.cleanup.loopexit ], [ -1, %entry ]
+  ret i32 %or
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %s.017 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.inc ]
+  %t.016 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
+  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+  %1 = load i8, ptr %arrayidx, align 1
+  %tobool.not = icmp eq i8 %1, 0
+  br i1 %tobool.not, label %if.end, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
+  %2 = load i32, ptr %arrayidx2, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  %t.1 = phi i32 [ %2, %if.then ], [ %t.016, %for.body ]
+  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+  %3 = load i8, ptr %arrayidx4, align 1
+  %tobool5.not = icmp eq i8 %3, 0
+  br i1 %tobool5.not, label %for.inc, label %if.then6
+
+if.then6:                                         ; preds = %if.end
+  %arrayidx8 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
+  %4 = load i32, ptr %arrayidx8, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end, %if.then6
+  %s.1 = phi i32 [ %4, %if.then6 ], [ %s.017, %if.end ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; float csa_in_series_float(int N, bool *cond0, bool *cond1, float *data0,
+;                           float *data1) {
+;   float t = 1.0f;
+;   float s = 1.0f;
+;   for (int i = 0; i < N; i++) {
+;     if (cond0[i])
+;       t = data0[i];
+;     if (cond1[i])
+;       s = data1[i];
+;   }
+;   return t + s; // use t and s
+; }
+define float @csa_in_series_float(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_in_series_float(
+; EVL-NOT: vector.body:
+;
+; NO-EVL-LABEL: @csa_in_series_float(
+; NO-EVL-NOT: vector.body:
+;
+; DATA-LABEL: @csa_in_series_float(
+; DATA-NOT: vector.body:
+;
+entry:
+  %cmp15 = icmp sgt i32 %N, 0
+  br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.inc
+  %0 = fadd float %t.1, %s.1
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %add = phi float [ %0, %for.cond.cleanup.loopexit ], [ 2.000000e+00, %entry ]
+  ret float %add
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %s.017 = phi float [ 1.000000e+00, %for.body.preheader ], [ %s.1, %for.inc ]
+  %t.016 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
+  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+  %1 = load i8, ptr %arrayidx, align 1
+  %tobool.not = icmp eq i8 %1, 0
+  br i1 %tobool.not, label %if.end, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
+  %2 = load float, ptr %arrayidx2, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  %t.1 = phi float [ %2, %if.then ], [ %t.016, %for.body ]
+  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+  %3 = load i8, ptr %arrayidx4, align 1
+  %tobool5.not = icmp eq i8 %3, 0
+  br i1 %tobool5.not, label %for.inc, label %if.then6
+
+if.then6:                                         ; preds = %if.end
+  %arrayidx8 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
+  %4 = load float, ptr %arrayidx8, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end, %if.then6
+  %s.1 = phi float [ %4, %if.then6 ], [ %s.017, %if.end ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int csa_in_series_same_scalar_int_select(int N, int *data0,
+;                                   int *data1) {
+;   int t = -1;
+;   for (int i = 0; i < N; i++) {
+;     if (i < data0[i])
+;       t = data0[i];
+;     if (i < data1[i])
+;       t = data1[i];
+;   }
+;   return t; // use t
+; }
+define i32 @csa_in_series_same_scalar_int_select(i32 %N, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_in_series_same_scalar_int_select(
+; EVL-NOT: vector.body:
+;
+; NO-EVL-LABEL: @csa_in_series_same_scalar_int_select(
+; NO-EVL-NOT: vector.body:
+;
+; DATA-LABEL: @csa_in_series_same_scalar_int_select(
+; DATA-NOT: vector.body:
+;
+entry:
+  %cmp21 = icmp sgt i32 %N, 0
+  br i1 %cmp21, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  %t.0.lcssa = phi i32 [ -1, %entry ], [ %t.2, %for.body ]
+  ret i32 %t.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %t.022 = phi i32 [ -1, %for.body.preheader ], [ %t.2, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %1 = sext i32 %0 to i64
+  %cmp1 = icmp slt i64 %indvars.iv, %1
+  %spec.select = select i1 %cmp1, i32 %0, i32 %t.022
+  %arrayidx5 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
+  %2 = load i32, ptr %arrayidx5, align 4
+  %3 = sext i32 %2 to i64
+  %cmp6 = icmp slt i64 %indvars.iv, %3
+  %t.2 = select i1 %cmp6, i32 %2, i32 %spec.select
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; float csa_in_series_same_scalar_float_select(int N,
+;                                       float *data0, float *data1) {
+;   float t = 1.0f;
+;   for (int i = 0; i < N; i++) {
+;     if (0.0f < data0[i])
+;       t = data0[i];
+;     if (0.0f < data1[i])
+;       t = data1[i];
+;   }
+;   return t; // use t
+; }
+define float @csa_in_series_same_scalar_float_select(i32 %N, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_in_series_same_scalar_float_select(
+; EVL-NOT: vector.body:
+;
+; NO-EVL-LABEL: @csa_in_series_same_scalar_float_select(
+; NO-EVL-NOT: vector.body:
+;
+; DATA-LABEL: @csa_in_series_same_scalar_float_select(
+; NO-EVL-NOT: vector.body:
+;
+entry:
+  %cmp19 = icmp sgt i32 %N, 0
+  br i1 %cmp19, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.2, %for.body ]
+  ret float %t.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %t.020 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.2, %for.body ]
+  %arrayidx = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
+  %0 = load float, ptr %arrayidx, align 4
+  %cmp1 = fcmp ogt float %0, 0.000000e+00
+  %t.1 = select i1 %cmp1, float %0, float %t.020
+  %arrayidx5 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
+  %1 = load float, ptr %arrayidx5, align 4
+  %cmp6 = fcmp ogt float %1, 0.000000e+00
+  %t.2 = select i1 %cmp6, float %1, float %t.1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int csa_in_series_same_scalar_int(int N, bool *cond0, bool *cond1, int *data0,
+;                                   int *data1) {
+;   int t = -1;
+;   for (int i = 0; i < N; i++) {
+;     if (cond0[i])
+;       t = data0[i];
+;     if (cond1[i])
+;       t = data1[i];
+;   }
+;   return t; // use t
+; }
+define i32 @csa_in_series_same_scalar_int(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_in_series_same_scalar_int(
+; EVL-NOT: vector.body:
+;
+; NO-EVL-LABEL: @csa_in_series_same_scalar_int(
+; NO-EVL-NOT: vector.body:
+;
+; DATA-LABEL: @csa_in_series_same_scalar_int(
+; DATA-NOT: vector.body:
+;
+entry:
+  %cmp15 = icmp sgt i32 %N, 0
+  br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.inc, %entry
+  %t.0.lcssa = phi i32 [ -1, %entry ], [ %t.2, %for.inc ]
+  ret i32 %t.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %t.016 = phi i32 [ -1, %for.body.preheader ], [ %t.2, %for.inc ]
+  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %tobool.not = icmp eq i8 %0, 0
+  br i1 %tobool.not, label %if.end, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
+  %1 = load i32, ptr %arrayidx2, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  %t.1 = phi i32 [ %1, %if.then ], [ %t.016, %for.body ]
+  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+  %2 = load i8, ptr %arrayidx4, align 1
+  %tobool5.not = icmp eq i8 %2, 0
+  br i1 %tobool5.not, label %for.inc, label %if.then6
+
+if.then6:                                         ; preds = %if.end
+  %arrayidx8 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
+  %3 = load i32, ptr %arrayidx8, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end, %if.then6
+  %t.2 = phi i32 [ %3, %if.then6 ], [ %t.1, %if.end ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; float csa_in_series_same_scalar_float(int N, bool *cond0, bool *cond1,
+;                                       float *data0, float *data1) {
+;   float t = 1.0f;
+;   for (int i = 0; i < N; i++) {
+;     if (cond0[i])
+;       t = data0[i];
+;     if (cond1[i])
+;       t = data1[i];
+;   }
+;   return t; // use t
+; }
+define float @csa_in_series_same_scalar_float(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_in_series_same_scalar_float(
+; EVL-NOT: vector.body:
+;
+; NO-EVL-LABEL: @csa_in_series_same_scalar_float(
+; NO-EVL-NOT: vector.body:
+;
+; DATA-LABEL: @csa_in_series_same_scalar_float(
+; DATA-NOT: vector.body:
+;
+entry:
+  %cmp15 = icmp sgt i32 %N, 0
+  br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.inc, %entry
+  %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.2, %for.inc ]
+  ret float %t.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %t.016 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.2, %for.inc ]
+  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %tobool.not = icmp eq i8 %0, 0
+  br i1 %tobool.not, label %if.end, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
+  %1 = load float, ptr %arrayidx2, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  %t.1 = phi float [ %1, %if.then ], [ %t.016, %for.body ]
+  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+  %2 = load i8, ptr %arrayidx4, align 1
+  %tobool5.not = icmp eq i8 %2, 0
+  br i1 %tobool5.not, label %for.inc, label %if.then6
+
+if.then6:                                         ; preds = %if.end
+  %arrayidx8 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
+  %3 = load float, ptr %arrayidx8, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end, %if.then6
+  %t.2 = phi float [ %3, %if.then6 ], [ %t.1, %if.end ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int csa_same_cond_int(int N, bool *cond, int *data0, int *data1) {
+;   int t = -1;
+;   int s = -1;
+;   for (int i = 0; i < N; i++) {
+;     if (cond[i]) {
+;       t = data0[i];
+;       s = data1[i];
+;     }
+;   }
+;   return t | s; // use t and s
+; }
+define i32 @csa_same_cond_int(i32 %N, ptr %cond, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_same_cond_int(
+; EVL-NOT: vector.body:
+;
+; NO-EVL-LABEL: @csa_same_cond_int(
+; NO-EVL-NOT: vector.body:
+;
+; DATA-LABEL: @csa_same_cond_int(
+; DATA-NOT: vector.body:
+;
+entry:
+  %cmp9 = icmp sgt i32 %N, 0
+  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.inc
+  %0 = or i32 %s.1, %t.1
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %or = phi i32 [ %0, %for.cond.cleanup.loopexit ], [ -1, %entry ]
+  ret i32 %or
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %s.011 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.inc ]
+  %t.010 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
+  %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %indvars.iv
+  %1 = load i8, ptr %arrayidx, align 1
+  %tobool.not = icmp eq i8 %1, 0
+  br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
+  %2 = load i32, ptr %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
+  %3 = load i32, ptr %arrayidx4, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %t.1 = phi i32 [ %2, %if.then ], [ %t.010, %for.body ]
+  %s.1 = phi i32 [ %3, %if.then ], [ %s.011, %for.body ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; float csa_same_cond_float(int N, bool *cond, float *data0, float *data1) {
+;   float t = 1.0f;
+;   float s = 1.0f;
+;   for (int i = 0; i < N; i++) {
+;     if (cond[i]) {
+;       t = data0[i];
+;       s = data1[i];
+;     }
+;   }
+;   return t + s; // use t and s
+; }
+define float @csa_same_cond_float(i32 %N, ptr %cond, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_same_cond_float(
+; EVL-NOT: vector.body:
+;
+; NO-EVL-LABEL: @csa_same_cond_float(
+; NO-EVL-NOT: vector.body:
+;
+; DATA-LABEL: @csa_same_cond_float(
+; DATA-NOT: vector.body:
+;
+entry:
+  %cmp9 = icmp sgt i32 %N, 0
+  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.inc
+  %0 = fadd float %t.1, %s.1
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %add = phi float [ %0, %for.cond.cleanup.loopexit ], [ 2.000000e+00, %entry ]
+  ret float %add
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %s.011 = phi float [ 1.000000e+00, %for.body.preheader ], [ %s.1, %for.inc ]
+  %t.010 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
+  %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %indvars.iv
+  %1 = load i8, ptr %arrayidx, align 1
+  %tobool.not = icmp eq i8 %1, 0
+  br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
+  %2 = load float, ptr %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
+  %3 = load float, ptr %arrayidx4, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %t.1 = phi float [ %2, %if.then ], [ %t.010, %for.body ]
+  %s.1 = phi float [ %3, %if.then ], [ %s.011, %for.body ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int csa_else_if_same_scalar_int(int N, bool *cond0, bool *cond1, int *data0,
+;                                 int *data1) {
+;   int t = -1;
+;   for (int i = 0; i < N; i++) {
+;     if (cond0[i])
+;       t = data0[i];
+;     else if (cond1[i])
+;       t = data1[i];
+;   }
+;   return t; // use t
+; }
+define i32 @csa_else_if_same_scalar_int(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_else_if_same_scalar_int(
+; EVL-NOT: vector.body:
+;
+; NO-EVL-LABEL: @csa_else_if_same_scalar_int(
+; NO-EVL-NOT: vector.body:
+;
+; DATA-LABEL: @csa_else_if_same_scalar_int(
+; DATA-NOT: vector.body:
+;
+entry:
+  %cmp15 = icmp sgt i32 %N, 0
+  br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.inc, %entry
+  %t.0.lcssa = phi i32 [ -1, %entry ], [ %t.1, %for.inc ]
+  ret i32 %t.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %t.016 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
+  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %tobool.not = icmp eq i8 %0, 0
+  br i1 %tobool.not, label %if.else, label %for.inc.sink.split
+
+if.else:                                          ; preds = %for.body
+  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+  %1 = load i8, ptr %arrayidx4, align 1
+  %tobool5.not = icmp eq i8 %1, 0
+  br i1 %tobool5.not, label %for.inc, label %for.inc.sink.split
+
+for.inc.sink.split:                               ; preds = %if.else, %for.body
+  %data0.sink = phi ptr [ %data0, %for.body ], [ %data1, %if.else ]
+  %arrayidx2 = getelementptr inbounds i32, ptr %data0.sink, i64 %indvars.iv
+  %2 = load i32, ptr %arrayidx2, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.inc.sink.split, %if.else
+  %t.1 = phi i32 [ %t.016, %if.else ], [ %2, %for.inc.sink.split ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; float csa_else_if_same_scalar_float(int N, bool *cond0, bool *cond1,
+;                                     float *data0, float *data1) {
+;   float t = 1.0f;
+;   for (int i = 0; i < N; i++) {
+;     if (cond0[i])
+;       t = data0[i];
+;     else if (cond1[i])
+;       t = data1[i];
+;   }
+;   return t; // use t
+; }
+define float @csa_else_if_same_scalar_float(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_else_if_same_scalar_float(
+; EVL-NOT: vector.body:
+;
+; NO-EVL-LABEL: @csa_else_if_same_scalar_float(
+; NO-EVL-NOT: vector.body:
+;
+; DATA-LABEL: @csa_else_if_same_scalar_float(
+; DATA-NOT: vector.body:
+;
+entry:
+  %cmp15 = icmp sgt i32 %N, 0
+  br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.inc, %entry
+  %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.1, %for.inc ]
+  ret float %t.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %t.016 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
+  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %tobool.not = icmp eq i8 %0, 0
+  br i1 %tobool.not, label %if.else, label %for.inc.sink.split
+
+if.else:                                          ; preds = %for.body
+  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+  %1 = load i8, ptr %arrayidx4, align 1
+  %tobool5.not = icmp eq i8 %1, 0
+  br i1 %tobool5.not, label %for.inc, label %for.inc.sink.split
+
+for.inc.sink.split:                               ; preds = %if.else, %for.body
+  %data0.sink = phi ptr [ %data0, %for.body ], [ %data1, %if.else ]
+  %arrayidx2 = getelementptr inbounds float, ptr %data0.sink, i64 %indvars.iv
+  %2 = load float, ptr %arrayidx2, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.inc.sink.split, %if.else
+  %t.1 = phi float [ %t.016, %if.else ], [ %2, %for.inc.sink.split ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int csa_else_if_int(int N, bool *cond0, bool *cond1, int *data0, int *data1) {
+;   int t = -1;
+;   int s = -1;
+;   for (int i = 0; i < N; i++) {
+;     if (cond0[i])
+;       t = data0[i];
+;     else if (cond1[i])
+;       s = data1[i];
+;   }
+;   return t | s; // use t and s
+; }
+define i32 @csa_else_if_int(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_else_if_int(
+; EVL-NOT: vector.body:
+;
+; NO-EVL-LABEL: @csa_else_if_int(
+; NO-EVL-NOT: vector.body:
+;
+; DATA-LABEL: @csa_else_if_int(
+; DATA-NOT: vector.body:
+;
+entry:
+  %cmp15 = icmp sgt i32 %N, 0
+  br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.inc
+  %0 = or i32 %s.1, %t.1
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %or = phi i32 [ %0, %for.cond.cleanup.loopexit ], [ -1, %entry ]
+  ret i32 %or
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %s.017 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.inc ]
+  %t.016 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
+  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+  %1 = load i8, ptr %arrayidx, align 1
+  %tobool.not = icmp eq i8 %1, 0
+  br i1 %tobool.not, label %if.else, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
+  %2 = load i32, ptr %arrayidx2, align 4
+  br label %for.inc
+
+if.else:                                          ; preds = %for.body
+  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+  %3 = load i8, ptr %arrayidx4, align 1
+  %tobool5.not = icmp eq i8 %3, 0
+  br i1 %tobool5.not, label %for.inc, label %if.then6
+
+if.then6:                                         ; preds = %if.else
+  %arrayidx8 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
+  %4 = load i32, ptr %arrayidx8, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.then, %if.then6, %if.else
+  %t.1 = phi i32 [ %2, %if.then ], [ %t.016, %if.then6 ], [ %t.016, %if.else ]
+  %s.1 = phi i32 [ %s.017, %if.then ], [ %4, %if.then6 ], [ %s.017, %if.else ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; float csa_else_if_float(int N, bool *cond0, bool *cond1, float *data0,
+;                         float *data1) {
+;   float t = 1.0f;
+;   float s = 1.0f;
+;   for (int i = 0; i < N; i++) {
+;     if (cond0[i])
+;       t = data0[i];
+;     else if (cond1[i])
+;       s = data1[i];
+;   }
+;   return t + s; // use t and s
+; }
+define float @csa_else_if_float(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_else_if_float(
+; EVL-NOT: vector.body:
+;
+; NO-EVL-LABEL: @csa_else_if_float(
+; EVL-NOT: vector.body:
+;
+; DATA-LABEL: @csa_else_if_float(
+; DATA-NOT: vector.body:
+;
+entry:
+  %cmp15 = icmp sgt i32 %N, 0
+  br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.inc
+  %0 = fadd float %t.1, %s.1
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %add = phi float [ %0, %for.cond.cleanup.loopexit ], [ 2.000000e+00, %entry ]
+  ret float %add
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %s.017 = phi float [ 1.000000e+00, %for.body.preheader ], [ %s.1, %for.inc ]
+  %t.016 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
+  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+  %1 = load i8, ptr %arrayidx, align 1
+  %tobool.not = icmp eq i8 %1, 0
+  br i1 %tobool.not, label %if.else, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
+  %2 = load float, ptr %arrayidx2, align 4
+  br label %for.inc
+
+if.else:                                          ; preds = %for.body
+  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+  %3 = load i8, ptr %arrayidx4, align 1
+  %tobool5.not = icmp eq i8 %3, 0
+  br i1 %tobool5.not, label %for.inc, label %if.then6
+
+if.then6:                                         ; preds = %if.else
+  %arrayidx8 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
+  %4 = load float, ptr %arrayidx8, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.then, %if.then6, %if.else
+  %t.1 = phi float [ %2, %if.then ], [ %t.016, %if.then6 ], [ %t.016, %if.else ]
+  %s.1 = phi float [ %s.017, %if.then ], [ %4, %if.then6 ], [ %s.017, %if.else ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; uint64_t idx_scalar(int64_t *a, int64_t *b, uint64_t ii, uint64_t n) {
+;   uint64_t idx = ii;
+;   for (uint64_t i = 0; i < n; ++i)
+;     idx = (a[i] > b[i]) ? i : idx;
+;   return idx;
+; }
+define i64 @idx_scalar(ptr %a, ptr %b, i64 %ii, i64 %n) {
+; EVL-LABEL: @idx_scalar(
+; EVL-NOT: vector.body:
+;
+; NO-EVL-LABEL: @idx_scalar(
+; NO-EVL-NOT: vector.body:
+;
+; DATA-LABEL: @idx_scalar(
+; DATA-NOT: vector.body:
+;
+entry:
+  %cmp8.not = icmp eq i64 %n, 0
+  br i1 %cmp8.not, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %cond.lcssa = phi i64 [ %cond, %for.body ]
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %idx.0.lcssa = phi i64 [ %ii, %entry ], [ %cond.lcssa, %for.cond.cleanup.loopexit ]
+  ret i64 %idx.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.010 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %idx.09 = phi i64 [ %cond, %for.body ], [ %ii, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i64, ptr %a, i64 %i.010
+  %0 = load i64, ptr %arrayidx, align 8
+  %arrayidx1 = getelementptr inbounds i64, ptr %b, i64 %i.010
+  %1 = load i64, ptr %arrayidx1, align 8
+  %cmp2 = icmp sgt i64 %0, %1
+  %cond = select i1 %cmp2, i64 %i.010, i64 %idx.09
+  %inc = add nuw i64 %i.010, 1
+  %exitcond.not = icmp eq i64 %inc, %n
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; uint64_t idx_scalar_dec(int64_t *a, int64_t *b, uint64_t ii, uint64_t n) {
+;   uint64_t idx = ii;
+;   for (uint64_t i = n; i > 0; --i) // decreasing
+;      idx = (a[i - 1] > b[i - 1]) ? i : idx;
+;   return idx;
+; }
+define dso_local i64 @idx_scalar_dec(ptr %a, ptr %b, i64 %ii, i64 %n) {
+; EVL-LABEL: @idx_scalar_dec(
+; EVL-NOT: vector.body:
+;
+; NO-EVL-LABEL: @idx_scalar_dec(
+; NO-EVL-NOT: vector.body:
+;
+; DATA-LABEL: @idx_scalar_dec(
+; DATA-NOT: vector.body:
+;
+entry:
+  %cmp.not9 = icmp eq i64 %n, 0
+  br i1 %cmp.not9, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %cond.lcssa = phi i64 [ %cond, %for.body ]
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %idx.0.lcssa = phi i64 [ %ii, %entry ], [ %cond.lcssa, %for.cond.cleanup.loopexit ]
+  ret i64 %idx.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.011 = phi i64 [ %sub, %for.body ], [ %n, %for.body.preheader ]
+  %idx.010 = phi i64 [ %cond, %for.body ], [ %ii, %for.body.preheader ]
+  %sub = add i64 %i.011, -1
+  %arrayidx = getelementptr inbounds i64, ptr %a, i64 %sub
+  %0 = load i64, ptr %arrayidx, align 8
+  %arrayidx2 = getelementptr inbounds i64, ptr %b, i64 %sub
+  %1 = load i64, ptr %arrayidx2, align 8
+  %cmp3 = icmp sgt i64 %0, %1
+  %cond = select i1 %cmp3, i64 %i.011, i64 %idx.010
+  %cmp.not = icmp eq i64 %sub, 0
+  br i1 %cmp.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; The key part of this function is that the true arm of the select corresponds
+; to selecting the initial value, instead of selecting the new value.
+; int simple_csa_int_select_neg_cond(int N, int *data) {
+;   int t = 0;
+;   for (int i = 0; i < N; i++) {
+;     if (i != data[i])
+;       t = data[i];
+;   }
+;   return t; // use t
+; }
+define i32 @simple_csa_int_select_neg_cond(i32 %N, ptr %data) {
+; EVL-LABEL: @simple_csa_int_select_neg_cond(
+; EVL-NEXT:  entry:
+; EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL:       for.body.preheader:
+; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
+; EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; EVL:       vector.ph:
+; EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
+; EVL-NEXT:    [[TMP4:%.*]] = add <vscale x 1 x i64> [[TMP3]], zeroinitializer
+; EVL-NEXT:    [[TMP5:%.*]] = mul <vscale x 1 x i64> [[TMP4]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
+; EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP5]]
+; EVL-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP7:%.*]] = mul i64 1, [[TMP6]]
+; EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP7]], i64 0
+; EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; EVL:       vector.body:
+; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 0
+; EVL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP8]]
+; EVL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP10]], align 4
+; EVL-NEXT:    [[TMP11:%.*]] = zext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
+; EVL-NEXT:    [[TMP12:%.*]] = icmp eq <vscale x 1 x i64> [[VEC_IND]], [[TMP11]]
+; EVL-NEXT:    [[TMP13:%.*]] = xor <vscale x 1 x i1> [[TMP12]], shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer)
+; EVL-NEXT:    [[TMP14:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP13]])
+; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP14]], <vscale x 1 x i1> [[TMP13]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP14]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
+; EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; EVL-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; EVL:       middle.block:
+; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; EVL-NEXT:    [[TMP16:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; EVL-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP16]])
+; EVL-NEXT:    [[TMP18:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[TMP17]], 0
+; EVL-NEXT:    [[TMP20:%.*]] = and i1 [[TMP18]], [[TMP19]]
+; EVL-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], i32 0, i32 -1
+; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP21]]
+; EVL-NEXT:    [[TMP22:%.*]] = icmp sge i32 [[TMP21]], 0
+; EVL-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[CSA_EXTRACT]], i32 0
+; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL:       scalar.ph:
+; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; EVL:       for.cond.cleanup.loopexit:
+; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; EVL:       for.cond.cleanup:
+; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; EVL-NEXT:    ret i32 [[T_0_LCSSA]]
+; EVL:       for.body:
+; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[T_010:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT:    [[TMP25:%.*]] = zext i32 [[TMP24]] to i64
+; EVL-NEXT:    [[CMP1_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], [[TMP25]]
+; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP24]]
+; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+;
+; NO-EVL-LABEL: @simple_csa_int_select_neg_cond(
+; NO-EVL-NEXT:  entry:
+; NO-EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL:       for.body.preheader:
+; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
+; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-EVL:       vector.ph:
+; NO-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
+; NO-EVL-NEXT:    [[TMP4:%.*]] = add <vscale x 1 x i64> [[TMP3]], zeroinitializer
+; NO-EVL-NEXT:    [[TMP5:%.*]] = mul <vscale x 1 x i64> [[TMP4]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
+; NO-EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP5]]
+; NO-EVL-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP7:%.*]] = mul i64 1, [[TMP6]]
+; NO-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP7]], i64 0
+; NO-EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NO-EVL:       vector.body:
+; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP8]]
+; NO-EVL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP10]], align 4
+; NO-EVL-NEXT:    [[TMP11:%.*]] = zext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
+; NO-EVL-NEXT:    [[TMP12:%.*]] = icmp eq <vscale x 1 x i64> [[VEC_IND]], [[TMP11]]
+; NO-EVL-NEXT:    [[TMP13:%.*]] = xor <vscale x 1 x i1> [[TMP12]], shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer)
+; NO-EVL-NEXT:    [[TMP14:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP13]])
+; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP14]], <vscale x 1 x i1> [[TMP13]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP14]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; NO-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; NO-EVL-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; NO-EVL:       middle.block:
+; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; NO-EVL-NEXT:    [[TMP16:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; NO-EVL-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP16]])
+; NO-EVL-NEXT:    [[TMP18:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[TMP17]], 0
+; NO-EVL-NEXT:    [[TMP20:%.*]] = and i1 [[TMP18]], [[TMP19]]
+; NO-EVL-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], i32 0, i32 -1
+; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP21]]
+; NO-EVL-NEXT:    [[TMP22:%.*]] = icmp sge i32 [[TMP21]], 0
+; NO-EVL-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[CSA_EXTRACT]], i32 0
+; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL:       scalar.ph:
+; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-EVL:       for.cond.cleanup.loopexit:
+; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; NO-EVL:       for.cond.cleanup:
+; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; NO-EVL-NEXT:    ret i32 [[T_0_LCSSA]]
+; NO-EVL:       for.body:
+; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[T_010:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT:    [[TMP25:%.*]] = zext i32 [[TMP24]] to i64
+; NO-EVL-NEXT:    [[CMP1_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], [[TMP25]]
+; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP24]]
+; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+;
+; DATA-LABEL: @simple_csa_int_select_neg_cond(
+; DATA-NEXT:  entry:
+; DATA-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA:       for.body.preheader:
+; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
+; DATA-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DATA:       vector.ph:
+; DATA-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; DATA-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; DATA-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
+; DATA-NEXT:    [[TMP4:%.*]] = add <vscale x 1 x i64> [[TMP3]], zeroinitializer
+; DATA-NEXT:    [[TMP5:%.*]] = mul <vscale x 1 x i64> [[TMP4]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
+; DATA-NEXT:    [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP5]]
+; DATA-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT:    [[TMP7:%.*]] = mul i64 1, [[TMP6]]
+; DATA-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP7]], i64 0
+; DATA-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; DATA-NEXT:    br label [[VECTOR_BODY:%.*]]
+; DATA:       vector.body:
+; DATA-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 0
+; DATA-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP8]]
+; DATA-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; DATA-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP10]], align 4
+; DATA-NEXT:    [[TMP11:%.*]] = zext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
+; DATA-NEXT:    [[TMP12:%.*]] = icmp eq <vscale x 1 x i64> [[VEC_IND]], [[TMP11]]
+; DATA-NEXT:    [[TMP13:%.*]] = xor <vscale x 1 x i1> [[TMP12]], shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer)
+; DATA-NEXT:    [[TMP14:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP13]])
+; DATA-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP14]], <vscale x 1 x i1> [[TMP13]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; DATA-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP14]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
+; DATA-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; DATA-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; DATA-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DATA-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; DATA:       middle.block:
+; DATA-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; DATA-NEXT:    [[TMP16:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; DATA-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP16]])
+; DATA-NEXT:    [[TMP18:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; DATA-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[TMP17]], 0
+; DATA-NEXT:    [[TMP20:%.*]] = and i1 [[TMP18]], [[TMP19]]
+; DATA-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], i32 0, i32 -1
+; DATA-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP21]]
+; DATA-NEXT:    [[TMP22:%.*]] = icmp sge i32 [[TMP21]], 0
+; DATA-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[CSA_EXTRACT]], i32 0
+; DATA-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; DATA-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DATA:       scalar.ph:
+; DATA-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; DATA-NEXT:    br label [[FOR_BODY:%.*]]
+; DATA:       for.cond.cleanup.loopexit:
+; DATA-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
+; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
+; DATA:       for.cond.cleanup:
+; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; DATA-NEXT:    ret i32 [[T_0_LCSSA]]
+; DATA:       for.body:
+; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[T_010:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; DATA-NEXT:    [[TMP25:%.*]] = zext i32 [[TMP24]] to i64
+; DATA-NEXT:    [[CMP1_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], [[TMP25]]
+; DATA-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP24]]
+; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+;
+entry:
+  %cmp9 = icmp sgt i32 %N, 0
+  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %spec.select.lcssa = phi i32 [ %spec.select, %for.body ]
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %t.0.lcssa = phi i32 [ 0, %entry ], [ %spec.select.lcssa, %for.cond.cleanup.loopexit ]
+  ret i32 %t.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %t.010 = phi i32 [ 0, %for.body.preheader ], [ %spec.select, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %data, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %1 = zext i32 %0 to i64
+  %cmp1.not = icmp eq i64 %indvars.iv, %1
+  %spec.select = select i1 %cmp1.not, i32 %t.010, i32 %0
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int *simple_csa_ptr_select(int N, int **data) {
+;   int *t = nullptr;
+;   for (int i = 0; i < N; i++) {
+;     if (a < *data[i])
+;       t = data[i];
+;   }
+;   return t; // use t
+; }
+define ptr @simple_csa_ptr_select(i32 %N, ptr %data, i32 %a) {
+; EVL-LABEL: @simple_csa_ptr_select(
+; EVL-NEXT:  entry:
+; EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL:       for.body.preheader:
+; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; EVL:       for.cond.cleanup.loopexit:
+; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; EVL:       for.cond.cleanup:
+; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT:    ret ptr [[T_0_LCSSA]]
+; EVL:       for.body:
+; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[T_010:%.*]] = phi ptr [ null, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
+; EVL-NEXT:    [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+; EVL-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
+; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP2]]
+; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP0]], ptr [[T_010]]
+; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @simple_csa_ptr_select(
+; NO-EVL-NEXT:  entry:
+; NO-EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL:       for.body.preheader:
+; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-EVL:       for.cond.cleanup.loopexit:
+; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; NO-EVL:       for.cond.cleanup:
+; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT:    ret ptr [[T_0_LCSSA]]
+; NO-EVL:       for.body:
+; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[T_010:%.*]] = phi ptr [ null, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
+; NO-EVL-NEXT:    [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+; NO-EVL-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
+; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP2]]
+; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP0]], ptr [[T_010]]
+; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @simple_csa_ptr_select(
+; DATA-NEXT:  entry:
+; DATA-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA:       for.body.preheader:
+; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT:    br label [[FOR_BODY:%.*]]
+; DATA:       for.cond.cleanup.loopexit:
+; DATA-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
+; DATA:       for.cond.cleanup:
+; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; DATA-NEXT:    ret ptr [[T_0_LCSSA]]
+; DATA:       for.body:
+; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[T_010:%.*]] = phi ptr [ null, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
+; DATA-NEXT:    [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+; DATA-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
+; DATA-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP2]]
+; DATA-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP0]], ptr [[T_010]]
+; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+  %cmp9 = icmp sgt i32 %N, 0
+  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  %spec.select.lcssa = phi ptr [ %spec.select, %for.body ]
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  %t.0.lcssa = phi ptr [ null, %entry ], [ %spec.select.lcssa, %for.cond.cleanup.loopexit ]
+  ret ptr %t.0.lcssa
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %t.010 = phi ptr [ null, %for.body.preheader ], [ %spec.select, %for.body ]
+  %arrayidx = getelementptr inbounds ptr, ptr %data, i64 %indvars.iv
+  %0 = load ptr, ptr %arrayidx, align 8
+  %1 = load i32, ptr %0, align 4
+  %2 = sext i32 %1 to i64
+  %cmp1 = icmp slt i64 %a, %2
+  %spec.select = select i1 %cmp1, ptr %0, ptr %t.010
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}

>From edd4a8d697d6780fdd11d6e622520a08a6d18f7f Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Fri, 30 Aug 2024 11:01:18 -0700
Subject: [PATCH 08/23] fixup! clang-format

---
 llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp  | 1 -
 llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 7fc258e15579df..c39084d2906a32 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3575,4 +3575,3 @@ void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
   printOperands(O, SlotTracker);
 }
 #endif
-
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index baafa70b9effbf..2b9c32624a2916 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -15,8 +15,8 @@
 #include "VPlanVerifier.h"
 #include "VPlan.h"
 #include "VPlanCFG.h"
-#include "VPlanUtils.h"
 #include "VPlanDominatorTree.h"
+#include "VPlanUtils.h"
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/TypeSwitch.h"

>From 46cbe10451d49484a838d60fe785713ad41dbf97 Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Fri, 30 Aug 2024 18:12:33 -0700
Subject: [PATCH 09/23] fixup! respond to some more reviews

---
 llvm/include/llvm/Analysis/CSADescriptors.h   |  2 +-
 llvm/lib/Analysis/CSADescriptors.cpp          | 41 ++++++++++---------
 llvm/lib/Transforms/Vectorize/VPlan.h         |  1 -
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  8 ++--
 4 files changed, 25 insertions(+), 27 deletions(-)

diff --git a/llvm/include/llvm/Analysis/CSADescriptors.h b/llvm/include/llvm/Analysis/CSADescriptors.h
index edd98777d84ab6..75372c1ba93d8a 100644
--- a/llvm/include/llvm/Analysis/CSADescriptors.h
+++ b/llvm/include/llvm/Analysis/CSADescriptors.h
@@ -1,4 +1,4 @@
-//===- llvm/Analysis/CSADescriptors.h - CSA Descriptors --*- C++ -*-===//
+//===------------- CSADescriptors.h - CSA Descriptors -----------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/lib/Analysis/CSADescriptors.cpp b/llvm/lib/Analysis/CSADescriptors.cpp
index d0377c8c16de33..1f4ac5ef8b2516 100644
--- a/llvm/lib/Analysis/CSADescriptors.cpp
+++ b/llvm/lib/Analysis/CSADescriptors.cpp
@@ -1,4 +1,4 @@
-//=== llvm/Analysis/CSADescriptors.cpp - CSA Descriptors -*- C++ -*-===//
+//===----------- CSADescriptors..cpp - CSA Descriptors ----------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -19,43 +19,43 @@ using namespace llvm::PatternMatch;
 
 #define DEBUG_TYPE "csa-descriptors"
 
+/// Return CSADescriptor that describes a CSA that matches one of these
+/// patterns:
+///   phi loop_inv, (select cmp, value, phi)
+///   phi loop_inv, (select cmp, phi, value)
+///   phi (select cmp, value, phi), loop_inv
+///   phi (select cmp, phi, value), loop_inv
+/// If the CSA does not match any of these paterns, return a CSADescriptor
+/// that describes an InvalidCSA.
 CSADescriptor CSADescriptor::isCSAPhi(PHINode *Phi, Loop *TheLoop) {
-  // Return CSADescriptor that describes a CSA that matches one of these
-  // patterns:
-  //   phi loop_inv, (select cmp, value, phi)
-  //   phi loop_inv, (select cmp, phi, value)
-  //   phi (select cmp, value, phi), loop_inv
-  //   phi (select cmp, phi, value), loop_inv
-  // If the CSA does not match any of these paterns, return a CSADescriptor
-  // that describes an InvalidCSA.
 
   // Must be a scalar
   Type *Type = Phi->getType();
   if (!Type->isIntegerTy() && !Type->isFloatingPointTy() &&
       !Type->isPointerTy())
-    return CSADescriptor();
+    return {};
 
   // Match phi loop_inv, (select cmp, value, phi)
   //    or phi loop_inv, (select cmp, phi, value)
   //    or phi (select cmp, value, phi), loop_inv
   //    or phi (select cmp, phi, value), loop_inv
   if (Phi->getNumIncomingValues() != 2)
-    return CSADescriptor();
-  auto SelectInstIt = find_if(Phi->incoming_values(), [&Phi](Use &U) {
+    return {};
+  auto SelectInstIt = find_if(Phi->incoming_values(), [&Phi](const Use &U) {
     return match(U.get(), m_Select(m_Value(), m_Specific(Phi), m_Value())) ||
            match(U.get(), m_Select(m_Value(), m_Value(), m_Specific(Phi)));
   });
   if (SelectInstIt == Phi->incoming_values().end())
-    return CSADescriptor();
+    return {};
   auto LoopInvIt = find_if(Phi->incoming_values(), [&](Use &U) {
     return U.get() != *SelectInstIt && TheLoop->isLoopInvariant(U.get());
   });
   if (LoopInvIt == Phi->incoming_values().end())
-    return CSADescriptor();
+    return {};
 
   // Phi or Sel must be used only outside the loop,
   // excluding if Phi use Sel or Sel use Phi
-  auto IsOnlyUsedOutsideLoop = [=](Value *V, Value *Ignore) {
+  auto IsOnlyUsedOutsideLoop = [&](Value *V, Value *Ignore) {
     return all_of(V->users(), [Ignore, TheLoop](User *U) {
       if (U == Ignore)
         return true;
@@ -64,10 +64,11 @@ CSADescriptor CSADescriptor::isCSAPhi(PHINode *Phi, Loop *TheLoop) {
       return true;
     });
   };
-  auto *Sel = cast<SelectInst>(SelectInstIt->get());
-  auto *LoopInv = LoopInvIt->get();
-  if (!IsOnlyUsedOutsideLoop(Phi, Sel) || !IsOnlyUsedOutsideLoop(Sel, Phi))
-    return CSADescriptor();
+  Instruction *Select = cast<SelectInst>(SelectInstIt->get());
+  Value *LoopInv = LoopInvIt->get();
+  if (!IsOnlyUsedOutsideLoop(Phi, Select) ||
+      !IsOnlyUsedOutsideLoop(Select, Phi))
+    return {};
 
-  return CSADescriptor(Phi, Sel, LoopInv);
+  return {Phi, Select, LoopInv};
 }
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 20b815a813bcaf..a4ac07259b87fd 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -4184,7 +4184,6 @@ class VPlanSlp {
   /// Return true if all visited instruction can be combined.
   bool isCompletelySLP() const { return CompletelySLP; }
 };
-
 } // end namespace llvm
 
 #endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index c39084d2906a32..33aaafd5b66357 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -680,8 +680,7 @@ Value *VPInstruction::generate(VPTransformState &State) {
       State.set(this, InitMask, Part);
       return InitMask;
     }
-    Value *V = State.get(this, Part - 1);
-    return V;
+    return State.get(this, Part - 1);
   }
   case VPInstruction::CSAInitData: {
     if (Part == 0) {
@@ -690,8 +689,7 @@ Value *VPInstruction::generate(VPTransformState &State) {
       State.set(this, InitData, Part);
       return InitData;
     }
-    Value *V = State.get(this, Part - 1);
-    return V;
+    return State.get(this, Part - 1);
   }
   case VPInstruction::CSAMaskPhi: {
     if (Part == 0) {
@@ -2409,7 +2407,7 @@ void VPCSAExtractScalarRecipe::execute(VPTransformState &State) {
     // inactive, which would also cause the reduction to have value 0.
     Value *MaybeLastIdx = State.Builder.CreateIntMaxReduce(ActiveLaneIdxs);
     Value *IsLaneZeroActive =
-        State.Builder.CreateExtractElement(MaskSel, (uint64_t)0);
+        State.Builder.CreateExtractElement(MaskSel, static_cast<uint64_t>(0));
     Value *Zero = ConstantInt::get(MaybeLastIdx->getType(), 0);
     Value *MaybeLastIdxEQZero = State.Builder.CreateICmpEQ(MaybeLastIdx, Zero);
     Value *And = State.Builder.CreateAnd(IsLaneZeroActive, MaybeLastIdxEQZero);

>From 76dd05978cb433c5b273ea72e7f653b3739c9768 Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Tue, 3 Sep 2024 10:50:33 -0700
Subject: [PATCH 10/23] fixup! refactor phi functions

---
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |  4 +--
 llvm/lib/Transforms/Vectorize/VPlan.h         | 31 ++++++++++++++++++-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 15 +++++++++
 llvm/lib/Transforms/Vectorize/VPlanUtils.cpp  | 26 ----------------
 llvm/lib/Transforms/Vectorize/VPlanUtils.h    | 10 ------
 .../Transforms/Vectorize/VPlanVerifier.cpp    |  6 ++--
 6 files changed, 50 insertions(+), 42 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index e62e29b57f258b..9c6122eca106b6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -217,7 +217,7 @@ void VPBlockBase::deleteCFG(VPBlockBase *Entry) {
 
 VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() {
   iterator It = begin();
-  while (It != end() && vputils::isPhi(*It))
+  while (It != end() && It->isPhi())
     It++;
   return It;
 }
@@ -1066,7 +1066,7 @@ void VPlan::execute(VPTransformState *State) {
   VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock();
   for (VPRecipeBase &R : Header->phis()) {
     // Skip phi-like recipes that generate their backedege values themselves.
-    if (vputils::isPhiThatGeneratesBackedge(R))
+    if (R.isPhiThatGeneratesBackedge())
       continue;
 
     if (isa<VPWidenPointerInductionRecipe>(&R) ||
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index a4ac07259b87fd..c4c9c1781e76af 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -854,10 +854,29 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
   bool mayHaveSideEffects() const;
 
   /// Returns true for PHI-like recipes.
-  bool isPhi() const {
+  virtual bool isPhi() const {
+    assert(getVPDefID() != VPInstructionSC &&
+           "VPInstructions implement this function themselves");
     return getVPDefID() >= VPFirstPHISC && getVPDefID() <= VPLastPHISC;
   }
 
+  /// Returns true for PHI-like recipes that exists in vector loop header basic
+  /// block
+  virtual bool isHeaderPhi() const {
+    assert(getVPDefID() != VPInstructionSC &&
+           "VPInstructions implement this function themselves");
+    return (getVPDefID() >= VPFirstHeaderPHISC &&
+            getVPDefID() <= VPLastHeaderPHISC) ||
+           getVPDefID() == VPWidenPHISC;
+  }
+
+  /// Returns true for PHI-like recipes that generate their own backedge
+  virtual bool isPhiThatGeneratesBackedge() const {
+    assert(getVPDefID() != VPInstructionSC &&
+           "VPInstructions implement this function themselves");
+    return getVPDefID() == VPWidenPHISC || getVPDefID() == VPCSAHeaderPHISC;
+  }
+
   /// Returns true if the recipe may read from memory.
   bool mayReadFromMemory() const;
 
@@ -1437,6 +1456,16 @@ class VPInstruction : public VPRecipeWithIRFlags,
   /// Returns true if this VPInstruction's operands are single scalars and the
   /// result is also a single scalar.
   bool isSingleScalar() const;
+
+  /// Returns true for PHI-like recipes.
+  bool isPhi() const override;
+
+  /// Returns true for PHI-like recipes that exists in vector loop header basic
+  /// block
+  bool isHeaderPhi() const override;
+
+  /// Returns true for PHI-like recipes that generate their own backedge
+  bool isPhiThatGeneratesBackedge() const override;
 };
 
 /// A recipe to wrap on original IR instruction not to be modified during
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 33aaafd5b66357..658803e0e510bd 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -779,6 +779,21 @@ bool VPInstruction::isSingleScalar() const {
          getOpcode() == VPInstruction::ExplicitVectorLength;
 }
 
+bool VPInstruction::isPhi() const {
+  return getOpcode() == VPInstruction::CSAMaskPhi ||
+         getOpcode() == VPInstruction::CSAVLPhi;
+}
+
+bool VPInstruction::isHeaderPhi() const {
+  return getOpcode() == VPInstruction::CSAMaskPhi ||
+         getOpcode() == VPInstruction::CSAVLPhi;
+}
+
+bool VPInstruction::isPhiThatGeneratesBackedge() const {
+  return getOpcode() == VPInstruction::CSAMaskPhi ||
+         getOpcode() == VPInstruction::CSAVLPhi;
+}
+
 #if !defined(NDEBUG)
 bool VPInstruction::isFPMathOp() const {
   // Inspired by FPMathOperator::classof. Notable differences are that we don't
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 0a6e58f87f7ad4..ad47879c4c20b1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -116,29 +116,3 @@ bool vputils::isUniformAcrossVFsAndUFs(VPValue *V) {
       });
 }
 
-bool vputils::isPhi(const VPRecipeBase &R) {
-  if (R.isPhi())
-    return true;
-  if (auto *VPInst = dyn_cast<VPInstruction>(&R))
-    return VPInst->getOpcode() == VPInstruction::CSAMaskPhi ||
-           VPInst->getOpcode() == VPInstruction::CSAVLPhi;
-  return false;
-}
-
-bool vputils::isPhiThatGeneratesBackedge(const VPRecipeBase &R) {
-  if (isa<VPWidenPHIRecipe, VPCSAHeaderPHIRecipe>(&R))
-    return true;
-  if (auto *VPInst = dyn_cast<VPInstruction>(&R))
-    return VPInst->getOpcode() == VPInstruction::CSAMaskPhi ||
-           VPInst->getOpcode() == VPInstruction::CSAVLPhi;
-  return false;
-}
-
-bool vputils::isHeaderPhi(const VPRecipeBase &R) {
-  if (isa<VPHeaderPHIRecipe, VPWidenPHIRecipe>(&R))
-    return true;
-  if (auto *VPInst = dyn_cast<VPInstruction>(&R))
-    return VPInst->getOpcode() == VPInstruction::CSAMaskPhi ||
-           VPInst->getOpcode() == VPInstruction::CSAVLPhi;
-  return false;
-}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index 40cf8b3e87f122..96577700205213 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -61,16 +61,6 @@ bool isHeaderMask(const VPValue *V, VPlan &Plan);
 /// VPDerivedIV or VPCanonicalIVPHI).
 bool isUniformAcrossVFsAndUFs(VPValue *V);
 
-/// Returns true for PHI-like recipes.
-bool isPhi(const VPRecipeBase &R);
-
-/// Returns true for PHI-like recipes that generate their own backedge
-bool isPhiThatGeneratesBackedge(const VPRecipeBase &R);
-
-/// Returns true for PHI-like recipes that exists in vector loop header basic
-/// block
-bool isHeaderPhi(const VPRecipeBase &R);
-
 } // end namespace llvm::vputils
 
 #endif
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 2b9c32624a2916..808879b3f9c3b9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -74,11 +74,11 @@ bool VPlanVerifier::verifyPhiRecipes(const VPBasicBlock *VPBB) {
   const VPRegionBlock *ParentR = VPBB->getParent();
   bool IsHeaderVPBB = ParentR && !ParentR->isReplicator() &&
                       ParentR->getEntryBasicBlock() == VPBB;
-  while (RecipeI != End && vputils::isPhi(*RecipeI)) {
+  while (RecipeI != End && RecipeI->isPhi()) {
     if (isa<VPActiveLaneMaskPHIRecipe>(RecipeI))
       NumActiveLaneMaskPhiRecipes++;
 
-    if (IsHeaderVPBB && !vputils::isHeaderPhi(*RecipeI)) {
+    if (IsHeaderVPBB && !RecipeI->isHeaderPhi()) {
       errs() << "Found non-header PHI recipe in header VPBB";
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
       errs() << ": ";
@@ -105,7 +105,7 @@ bool VPlanVerifier::verifyPhiRecipes(const VPBasicBlock *VPBB) {
   }
 
   while (RecipeI != End) {
-    if (vputils::isPhi(*RecipeI) && !isa<VPBlendRecipe>(&*RecipeI)) {
+    if (RecipeI->isPhi() && !isa<VPBlendRecipe>(&*RecipeI)) {
       errs() << "Found phi-like recipe after non-phi recipe";
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)

>From 35067656f100b82d87426520fede2a2d768dbdda Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Tue, 3 Sep 2024 11:24:50 -0700
Subject: [PATCH 11/23] fixup! update test checks

---
 .../LoopVectorize/conditional-scalar-assignment.ll        | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment.ll b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment.ll
index 0269f0c672a3fa..48af3719c06ed1 100644
--- a/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment.ll
+++ b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment.ll
@@ -2981,7 +2981,7 @@ for.body:                                         ; preds = %for.body.preheader,
 ;   }
 ;   return t; // use t
 ; }
-define ptr @simple_csa_ptr_select(i32 %N, ptr %data, i32 %a) {
+define ptr @simple_csa_ptr_select(i32 %N, ptr %data, i64 %a) {
 ; EVL-LABEL: @simple_csa_ptr_select(
 ; EVL-NEXT:  entry:
 ; EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
@@ -3002,7 +3002,7 @@ define ptr @simple_csa_ptr_select(i32 %N, ptr %data, i32 %a) {
 ; EVL-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
 ; EVL-NEXT:    [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
 ; EVL-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
-; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP2]]
+; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A:%.*]], [[TMP2]]
 ; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP0]], ptr [[T_010]]
 ; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
@@ -3028,7 +3028,7 @@ define ptr @simple_csa_ptr_select(i32 %N, ptr %data, i32 %a) {
 ; NO-EVL-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
 ; NO-EVL-NEXT:    [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
 ; NO-EVL-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
-; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP2]]
+; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A:%.*]], [[TMP2]]
 ; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP0]], ptr [[T_010]]
 ; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
@@ -3054,7 +3054,7 @@ define ptr @simple_csa_ptr_select(i32 %N, ptr %data, i32 %a) {
 ; DATA-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
 ; DATA-NEXT:    [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
 ; DATA-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
-; DATA-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP2]]
+; DATA-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A:%.*]], [[TMP2]]
 ; DATA-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP0]], ptr [[T_010]]
 ; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]

>From 25e20a70e04f394882159e9684261bbc0593be74 Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Tue, 3 Sep 2024 11:59:34 -0700
Subject: [PATCH 12/23] fixup! use vpbuilder

---
 .../Vectorize/LoopVectorizationPlanner.h      | 31 +++++++++++++++++--
 .../Transforms/Vectorize/LoopVectorize.cpp    | 26 +++++++---------
 2 files changed, 40 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 00eec0a6f7b14e..9d568d4ddc8dc9 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -174,8 +174,8 @@ class VPBuilder {
         new VPInstruction(Opcode, Operands, WrapFlags, DL, Name));
   }
 
-  VPValue *createNot(VPValue *Operand, DebugLoc DL = {},
-                     const Twine &Name = "") {
+  VPInstruction *createNot(VPValue *Operand, DebugLoc DL = {},
+                           const Twine &Name = "") {
     return createInstruction(VPInstruction::Not, {Operand}, DL, Name);
   }
 
@@ -225,6 +225,33 @@ class VPBuilder {
     return createInstruction(VPInstruction::PtrAdd, {Ptr, Offset}, DL, Name);
   }
 
+  VPInstruction *createCSAInitMask(DebugLoc DL, const Twine &Name) {
+    return createInstruction(VPInstruction::CSAInitMask, {}, DL, Name);
+  }
+
+  VPInstruction *createCSAInitData(VPValue *InitScalar, DebugLoc DL,
+                                   const Twine &Name) {
+    return createInstruction(VPInstruction::CSAInitData, {InitScalar}, DL,
+                             Name);
+  }
+
+  VPInstruction *createCSAMaskPhi(VPValue *InitMask, DebugLoc DL,
+                                  const Twine &Name) {
+    return createInstruction(VPInstruction::CSAMaskPhi, {InitMask}, DL, Name);
+  }
+
+  VPInstruction *createCSAAnyActive(VPValue *Cond, DebugLoc DL,
+                                    const Twine &Name) {
+    return createInstruction(VPInstruction::CSAAnyActive, {Cond}, DL, Name);
+  }
+
+  VPInstruction *createCSAMaskSel(VPValue *Cond, VPValue *MaskPhi,
+                                  VPValue *AnyActive, DebugLoc DL,
+                                  const Twine &Name) {
+    return createInstruction(VPInstruction::CSAMaskSel,
+                             {Cond, MaskPhi, AnyActive}, DL, Name);
+  }
+
   VPDerivedIVRecipe *createDerivedIV(InductionDescriptor::InductionKind Kind,
                                      FPMathOperator *FPBinOp, VPValue *Start,
                                      VPCanonicalIVPHIRecipe *CanonicalIV,
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index d29a3347ff58ea..6d046451a7348d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8707,16 +8707,12 @@ addCSAPreprocessRecipes(const LoopVectorizationLegality::CSAList &CSAs,
       continue;
     }
 
-    auto *VPInitMask =
-        new VPInstruction(VPInstruction::CSAInitMask, {}, DL, "csa.init.mask");
-    auto *VPInitData = new VPInstruction(VPInstruction::CSAInitData,
-                                         {VPInitScalar}, DL, "csa.init.data");
-    PreheaderVPBB->appendRecipe(VPInitMask);
-    PreheaderVPBB->appendRecipe(VPInitData);
+    VPBuilder PHB(PreheaderVPBB);
+    auto *VPInitMask = PHB.createCSAInitMask(DL, "csa.init.mask");
+    auto *VPInitData = PHB.createCSAInitData(VPInitScalar, DL, "csa.init.data");
 
-    auto *VPMaskPhi = new VPInstruction(VPInstruction::CSAMaskPhi, {VPInitMask},
-                                        DL, "csa.mask.phi");
-    HeaderVPBB->appendRecipe(VPMaskPhi);
+    VPBuilder HB(HeaderVPBB);
+    auto *VPMaskPhi = HB.createCSAMaskPhi(VPInitMask, DL, "csa.mask.phi");
 
     auto *S = new VPCSAState(VPInitScalar, VPInitData, VPMaskPhi);
     Plan.addCSAState(CSA.first, S);
@@ -8755,22 +8751,22 @@ addCSAPostprocessRecipes(VPRecipeBuilder &RecipeBuilder,
     // In that case, we must use the negation of WidenedCond.
     // i.e. select cond new_val old_val versus select cond.not old_val new_val
     VPValue *CondToUse = WidenedCond;
+    VPBuilder B;
     if (cast<SelectInst>(CSA.second.getAssignment())->getTrueValue() ==
         CSA.first) {
-      auto *VPNotCond = new VPInstruction(VPInstruction::Not, WidenedCond, DL);
+      auto *VPNotCond = B.createNot(WidenedCond, DL);
       VPNotCond->insertBefore(
           GetVPValue(CSA.second.getAssignment())->getDefiningRecipe());
       CondToUse = VPNotCond;
     }
 
-    auto *VPAnyActive = new VPInstruction(
-        VPInstruction::CSAAnyActive, {CondToUse}, DL, "csa.cond.anyactive");
+    auto *VPAnyActive =
+        B.createCSAAnyActive(CondToUse, DL, "csa.cond.anyactive");
     VPAnyActive->insertBefore(
         GetVPValue(CSA.second.getAssignment())->getDefiningRecipe());
 
-    auto *VPMaskSel = new VPInstruction(
-        VPInstruction::CSAMaskSel,
-        {CondToUse, CSAState->getVPMaskPhi(), VPAnyActive}, DL, "csa.mask.sel");
+    auto *VPMaskSel = B.createCSAMaskSel(CondToUse, CSAState->getVPMaskPhi(),
+                                         VPAnyActive, DL, "csa.mask.sel");
     VPMaskSel->insertAfter(VPAnyActive);
     VPDataUpdate->setVPNewMaskAndVPAnyActive(VPMaskSel, VPAnyActive);
     VPCSAExtractScalarRecipe *ExtractScalarRecipe =

>From 85f21b972724fbdbada7e26d0aeda86fe9a3ad02 Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Mon, 9 Sep 2024 06:46:00 -0700
Subject: [PATCH 13/23] fixup! simplify tests

---
 .../RISCV/conditional-scalar-assignment.ll    |  360 +---
 .../conditional-scalar-assignment.ll          | 1510 +++++------------
 2 files changed, 474 insertions(+), 1396 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/conditional-scalar-assignment.ll b/llvm/test/Transforms/LoopVectorize/RISCV/conditional-scalar-assignment.ll
index 6d7816800603e2..d8dd1d34e2bec3 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/conditional-scalar-assignment.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/conditional-scalar-assignment.ll
@@ -5,9 +5,6 @@
 ; RUN: opt < %s -S -passes=loop-vectorize -force-tail-folding-style=none \
 ; RUN:   -enable-csa-vectorization -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-min=256 \
 ; RUN:   | FileCheck %s -check-prefix=NO-EVL
-; RUN: opt < %s -S -passes=loop-vectorize -force-tail-folding-style=data \
-; RUN:   -enable-csa-vectorization -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-min=256 \
-; RUN:   | FileCheck %s -check-prefix=DATA
 
 ; This function is generated from the following C/C++ program:
 ; uint64_t idx_scalar(int64_t *a, int64_t *b, uint64_t ii, uint64_t n) {
@@ -20,7 +17,7 @@ define i64 @idx_scalar(ptr %a, ptr %b, i64 %ii, i64 %n) {
 ; EVL-LABEL: @idx_scalar(
 ; EVL-NEXT:  entry:
 ; EVL-NEXT:    [[CMP8_NOT:%.*]] = icmp eq i64 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP8_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; EVL-NEXT:    br i1 [[CMP8_NOT]], label [[EXIT:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
 ; EVL:       for.body.preheader:
 ; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
@@ -75,15 +72,15 @@ define i64 @idx_scalar(ptr %a, ptr %b, i64 %ii, i64 %n) {
 ; EVL-NEXT:    [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
 ; EVL-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
 ; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; EVL:       scalar.ph:
 ; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       for.cond.cleanup.loopexit:
+; EVL:       exit.loopexit:
 ; EVL-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; EVL:       for.cond.cleanup:
-; EVL-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; EVL-NEXT:    br label [[EXIT]]
+; EVL:       exit:
+; EVL-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[EXIT_LOOPEXIT]] ]
 ; EVL-NEXT:    ret i64 [[IDX_0_LCSSA]]
 ; EVL:       for.body:
 ; EVL-NEXT:    [[I_010:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -96,12 +93,12 @@ define i64 @idx_scalar(ptr %a, ptr %b, i64 %ii, i64 %n) {
 ; EVL-NEXT:    [[COND]] = select i1 [[CMP2]], i64 [[I_010]], i64 [[IDX_09]]
 ; EVL-NEXT:    [[INC]] = add nuw i64 [[I_010]], 1
 ; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ;
 ; NO-EVL-LABEL: @idx_scalar(
 ; NO-EVL-NEXT:  entry:
 ; NO-EVL-NEXT:    [[CMP8_NOT:%.*]] = icmp eq i64 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP8_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; NO-EVL-NEXT:    br i1 [[CMP8_NOT]], label [[EXIT:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
 ; NO-EVL:       for.body.preheader:
 ; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; NO-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
@@ -156,15 +153,15 @@ define i64 @idx_scalar(ptr %a, ptr %b, i64 %ii, i64 %n) {
 ; NO-EVL-NEXT:    [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
 ; NO-EVL-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
 ; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; NO-EVL:       scalar.ph:
 ; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       for.cond.cleanup.loopexit:
+; NO-EVL:       exit.loopexit:
 ; NO-EVL-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; NO-EVL:       for.cond.cleanup:
-; NO-EVL-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; NO-EVL-NEXT:    br label [[EXIT]]
+; NO-EVL:       exit:
+; NO-EVL-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[EXIT_LOOPEXIT]] ]
 ; NO-EVL-NEXT:    ret i64 [[IDX_0_LCSSA]]
 ; NO-EVL:       for.body:
 ; NO-EVL-NEXT:    [[I_010:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -177,102 +174,17 @@ define i64 @idx_scalar(ptr %a, ptr %b, i64 %ii, i64 %n) {
 ; NO-EVL-NEXT:    [[COND]] = select i1 [[CMP2]], i64 [[I_010]], i64 [[IDX_09]]
 ; NO-EVL-NEXT:    [[INC]] = add nuw i64 [[I_010]], 1
 ; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-;
-; DATA-LABEL: @idx_scalar(
-; DATA-NEXT:  entry:
-; DATA-NEXT:    [[CMP8_NOT:%.*]] = icmp eq i64 [[N:%.*]], 0
-; DATA-NEXT:    br i1 [[CMP8_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
-; DATA:       for.body.preheader:
-; DATA-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
-; DATA-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
-; DATA-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DATA:       vector.ph:
-; DATA-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
-; DATA-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; DATA-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; DATA-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
-; DATA-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
-; DATA-NEXT:    [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
-; DATA-NEXT:    [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; DATA-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
-; DATA-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
-; DATA-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP10]]
-; DATA-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
-; DATA-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; DATA-NEXT:    br label [[VECTOR_BODY:%.*]]
-; DATA:       vector.body:
-; DATA-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; DATA-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP12]]
-; DATA-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i32 0
-; DATA-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP14]], align 8
-; DATA-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP12]]
-; DATA-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[TMP15]], i32 0
-; DATA-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i64>, ptr [[TMP16]], align 8
-; DATA-NEXT:    [[TMP17:%.*]] = icmp sgt <vscale x 2 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; DATA-NEXT:    [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP17]])
-; DATA-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP18]], <vscale x 2 x i1> [[TMP17]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
-; DATA-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP18]], <vscale x 2 x i64> [[VEC_IND]], <vscale x 2 x i64> [[CSA_DATA_PHI]]
-; DATA-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; DATA-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; DATA-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DATA-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; DATA:       middle.block:
-; DATA-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; DATA-NEXT:    [[TMP20:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
-; DATA-NEXT:    [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP20]])
-; DATA-NEXT:    [[TMP22:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
-; DATA-NEXT:    [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
-; DATA-NEXT:    [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
-; DATA-NEXT:    [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
-; DATA-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x i64> [[CSA_DATA_SEL]], i32 [[TMP25]]
-; DATA-NEXT:    [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
-; DATA-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
-; DATA-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; DATA-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; DATA:       scalar.ph:
-; DATA-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; DATA-NEXT:    br label [[FOR_BODY:%.*]]
-; DATA:       for.cond.cleanup.loopexit:
-; DATA-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
-; DATA:       for.cond.cleanup:
-; DATA-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; DATA-NEXT:    ret i64 [[IDX_0_LCSSA]]
-; DATA:       for.body:
-; DATA-NEXT:    [[I_010:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; DATA-NEXT:    [[IDX_09:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
-; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[I_010]]
-; DATA-NEXT:    [[TMP28:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; DATA-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[I_010]]
-; DATA-NEXT:    [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
-; DATA-NEXT:    [[CMP2:%.*]] = icmp sgt i64 [[TMP28]], [[TMP29]]
-; DATA-NEXT:    [[COND]] = select i1 [[CMP2]], i64 [[I_010]], i64 [[IDX_09]]
-; DATA-NEXT:    [[INC]] = add nuw i64 [[I_010]], 1
-; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ;
 entry:
   %cmp8.not = icmp eq i64 %n, 0
-  br i1 %cmp8.not, label %for.cond.cleanup, label %for.body.preheader
+  br i1 %cmp8.not, label %exit, label %for.body.preheader
 
 for.body.preheader:                               ; preds = %entry
   br label %for.body
 
-for.cond.cleanup.loopexit:                        ; preds = %for.body
-  %cond.lcssa = phi i64 [ %cond, %for.body ]
-  br label %for.cond.cleanup
-
-for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
-  %idx.0.lcssa = phi i64 [ %ii, %entry ], [ %cond.lcssa, %for.cond.cleanup.loopexit ]
+exit:                                 ; preds = %for.body, %entry
+  %idx.0.lcssa = phi i64 [ %ii, %entry ], [ %cond, %for.body ]
   ret i64 %idx.0.lcssa
 
 for.body:                                         ; preds = %for.body.preheader, %for.body
@@ -286,7 +198,7 @@ for.body:                                         ; preds = %for.body.preheader,
   %cond = select i1 %cmp2, i64 %i.010, i64 %idx.09
   %inc = add nuw i64 %i.010, 1
   %exitcond.not = icmp eq i64 %inc, %n
-  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+  br i1 %exitcond.not, label %exit, label %for.body
 }
 
 ; This function is generated from the following C/C++ program:
@@ -300,7 +212,7 @@ define dso_local i64 @idx_scalar_dec(ptr %a, ptr %b, i64 %ii, i64 %n) {
 ; EVL-LABEL: @idx_scalar_dec(
 ; EVL-NEXT:  entry:
 ; EVL-NEXT:    [[CMP_NOT9:%.*]] = icmp eq i64 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP_NOT9]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; EVL-NEXT:    br i1 [[CMP_NOT9]], label [[EXIT:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
 ; EVL:       for.body.preheader:
 ; EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
 ; EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -348,15 +260,15 @@ define dso_local i64 @idx_scalar_dec(ptr %a, ptr %b, i64 %ii, i64 %n) {
 ; EVL-NEXT:    [[TMP17:%.*]] = icmp sge i32 [[TMP16]], 0
 ; EVL-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
 ; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; EVL:       scalar.ph:
 ; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ]
 ; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       for.cond.cleanup.loopexit:
+; EVL:       exit.loopexit:
 ; EVL-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; EVL:       for.cond.cleanup:
-; EVL-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; EVL-NEXT:    br label [[EXIT]]
+; EVL:       exit:
+; EVL-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[EXIT_LOOPEXIT]] ]
 ; EVL-NEXT:    ret i64 [[IDX_0_LCSSA]]
 ; EVL:       for.body:
 ; EVL-NEXT:    [[I_011:%.*]] = phi i64 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -369,12 +281,12 @@ define dso_local i64 @idx_scalar_dec(ptr %a, ptr %b, i64 %ii, i64 %n) {
 ; EVL-NEXT:    [[CMP3:%.*]] = icmp sgt i64 [[TMP19]], [[TMP20]]
 ; EVL-NEXT:    [[COND]] = select i1 [[CMP3]], i64 [[I_011]], i64 [[IDX_010]]
 ; EVL-NEXT:    [[CMP_NOT:%.*]] = icmp eq i64 [[SUB]], 0
-; EVL-NEXT:    br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; EVL-NEXT:    br i1 [[CMP_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ;
 ; NO-EVL-LABEL: @idx_scalar_dec(
 ; NO-EVL-NEXT:  entry:
 ; NO-EVL-NEXT:    [[CMP_NOT9:%.*]] = icmp eq i64 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP_NOT9]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; NO-EVL-NEXT:    br i1 [[CMP_NOT9]], label [[EXIT:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
 ; NO-EVL:       for.body.preheader:
 ; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
 ; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -422,15 +334,15 @@ define dso_local i64 @idx_scalar_dec(ptr %a, ptr %b, i64 %ii, i64 %n) {
 ; NO-EVL-NEXT:    [[TMP17:%.*]] = icmp sge i32 [[TMP16]], 0
 ; NO-EVL-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
 ; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; NO-EVL:       scalar.ph:
 ; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ]
 ; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       for.cond.cleanup.loopexit:
+; NO-EVL:       exit.loopexit:
 ; NO-EVL-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; NO-EVL:       for.cond.cleanup:
-; NO-EVL-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; NO-EVL-NEXT:    br label [[EXIT]]
+; NO-EVL:       exit:
+; NO-EVL-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[EXIT_LOOPEXIT]] ]
 ; NO-EVL-NEXT:    ret i64 [[IDX_0_LCSSA]]
 ; NO-EVL:       for.body:
 ; NO-EVL-NEXT:    [[I_011:%.*]] = phi i64 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -443,95 +355,17 @@ define dso_local i64 @idx_scalar_dec(ptr %a, ptr %b, i64 %ii, i64 %n) {
 ; NO-EVL-NEXT:    [[CMP3:%.*]] = icmp sgt i64 [[TMP19]], [[TMP20]]
 ; NO-EVL-NEXT:    [[COND]] = select i1 [[CMP3]], i64 [[I_011]], i64 [[IDX_010]]
 ; NO-EVL-NEXT:    [[CMP_NOT:%.*]] = icmp eq i64 [[SUB]], 0
-; NO-EVL-NEXT:    br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-;
-; DATA-LABEL: @idx_scalar_dec(
-; DATA-NEXT:  entry:
-; DATA-NEXT:    [[CMP_NOT9:%.*]] = icmp eq i64 [[N:%.*]], 0
-; DATA-NEXT:    br i1 [[CMP_NOT9]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
-; DATA:       for.body.preheader:
-; DATA-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
-; DATA-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DATA:       vector.ph:
-; DATA-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; DATA-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; DATA-NEXT:    [[IND_END:%.*]] = sub i64 [[N]], [[N_VEC]]
-; DATA-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[N]], i64 0
-; DATA-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
-; DATA-NEXT:    [[INDUCTION:%.*]] = add <8 x i64> [[DOTSPLAT]], <i64 0, i64 -1, i64 -2, i64 -3, i64 -4, i64 -5, i64 -6, i64 -7>
-; DATA-NEXT:    br label [[VECTOR_BODY:%.*]]
-; DATA:       vector.body:
-; DATA-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <8 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[VEC_IND:%.*]] = phi <8 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <8 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[TMP0:%.*]] = add <8 x i64> [[VEC_IND]], <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
-; DATA-NEXT:    [[TMP1:%.*]] = extractelement <8 x i64> [[TMP0]], i32 0
-; DATA-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP1]]
-; DATA-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
-; DATA-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 -7
-; DATA-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i64>, ptr [[TMP4]], align 8
-; DATA-NEXT:    [[REVERSE:%.*]] = shufflevector <8 x i64> [[WIDE_LOAD]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; DATA-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP1]]
-; DATA-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
-; DATA-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -7
-; DATA-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i64>, ptr [[TMP7]], align 8
-; DATA-NEXT:    [[REVERSE2:%.*]] = shufflevector <8 x i64> [[WIDE_LOAD1]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; DATA-NEXT:    [[TMP8:%.*]] = icmp sgt <8 x i64> [[REVERSE]], [[REVERSE2]]
-; DATA-NEXT:    [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP8]])
-; DATA-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP9]], <8 x i1> [[TMP8]], <8 x i1> [[CSA_MASK_PHI]]
-; DATA-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP9]], <8 x i64> [[VEC_IND]], <8 x i64> [[CSA_DATA_PHI]]
-; DATA-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; DATA-NEXT:    [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], <i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8>
-; DATA-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DATA-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; DATA:       middle.block:
-; DATA-NEXT:    [[TMP11:%.*]] = select <8 x i1> [[CSA_MASK_SEL]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, <8 x i32> zeroinitializer
-; DATA-NEXT:    [[TMP12:%.*]] = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> [[TMP11]])
-; DATA-NEXT:    [[TMP13:%.*]] = extractelement <8 x i1> [[CSA_MASK_SEL]], i64 0
-; DATA-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], 0
-; DATA-NEXT:    [[TMP15:%.*]] = and i1 [[TMP13]], [[TMP14]]
-; DATA-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 0, i32 -1
-; DATA-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <8 x i64> [[CSA_DATA_SEL]], i32 [[TMP16]]
-; DATA-NEXT:    [[TMP17:%.*]] = icmp sge i32 [[TMP16]], 0
-; DATA-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
-; DATA-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; DATA-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; DATA:       scalar.ph:
-; DATA-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ]
-; DATA-NEXT:    br label [[FOR_BODY:%.*]]
-; DATA:       for.cond.cleanup.loopexit:
-; DATA-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
-; DATA:       for.cond.cleanup:
-; DATA-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; DATA-NEXT:    ret i64 [[IDX_0_LCSSA]]
-; DATA:       for.body:
-; DATA-NEXT:    [[I_011:%.*]] = phi i64 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; DATA-NEXT:    [[IDX_010:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
-; DATA-NEXT:    [[SUB]] = add i64 [[I_011]], -1
-; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[SUB]]
-; DATA-NEXT:    [[TMP19:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; DATA-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[SUB]]
-; DATA-NEXT:    [[TMP20:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
-; DATA-NEXT:    [[CMP3:%.*]] = icmp sgt i64 [[TMP19]], [[TMP20]]
-; DATA-NEXT:    [[COND]] = select i1 [[CMP3]], i64 [[I_011]], i64 [[IDX_010]]
-; DATA-NEXT:    [[CMP_NOT:%.*]] = icmp eq i64 [[SUB]], 0
-; DATA-NEXT:    br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; NO-EVL-NEXT:    br i1 [[CMP_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ;
 entry:
   %cmp.not9 = icmp eq i64 %n, 0
-  br i1 %cmp.not9, label %for.cond.cleanup, label %for.body.preheader
+  br i1 %cmp.not9, label %exit, label %for.body.preheader
 
 for.body.preheader:                               ; preds = %entry
   br label %for.body
 
-for.cond.cleanup.loopexit:                        ; preds = %for.body
-  %cond.lcssa = phi i64 [ %cond, %for.body ]
-  br label %for.cond.cleanup
-
-for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
-  %idx.0.lcssa = phi i64 [ %ii, %entry ], [ %cond.lcssa, %for.cond.cleanup.loopexit ]
+exit:                                 ; preds = %for.body, %entry
+  %idx.0.lcssa = phi i64 [ %ii, %entry ], [ %cond, %for.body ]
   ret i64 %idx.0.lcssa
 
 for.body:                                         ; preds = %for.body.preheader, %for.body
@@ -545,10 +379,9 @@ for.body:                                         ; preds = %for.body.preheader,
   %cmp3 = icmp sgt i64 %0, %1
   %cond = select i1 %cmp3, i64 %i.011, i64 %idx.010
   %cmp.not = icmp eq i64 %sub, 0
-  br i1 %cmp.not, label %for.cond.cleanup.loopexit, label %for.body
+  br i1 %cmp.not, label %exit, label %for.body
 }
 
-
 ; This function is generated from the following C/C++ program:
 ; int *simple_csa_ptr_select(int N, int **data) {
 ;   int *t = nullptr;
@@ -562,7 +395,7 @@ define ptr @simple_csa_ptr_select(i32 %N, ptr %data) {
 ; EVL-LABEL: @simple_csa_ptr_select(
 ; EVL-NEXT:  entry:
 ; EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
 ; EVL:       for.body.preheader:
 ; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
 ; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -617,15 +450,15 @@ define ptr @simple_csa_ptr_select(i32 %N, ptr %data) {
 ; EVL-NEXT:    [[TMP25:%.*]] = icmp sge i32 [[TMP24]], 0
 ; EVL-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], ptr [[CSA_EXTRACT]], ptr null
 ; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; EVL:       scalar.ph:
 ; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       for.cond.cleanup.loopexit:
+; EVL:       exit.loopexit:
 ; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; EVL:       for.cond.cleanup:
-; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; EVL-NEXT:    br label [[EXIT]]
+; EVL:       exit:
+; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT]] ]
 ; EVL-NEXT:    ret ptr [[T_0_LCSSA]]
 ; EVL:       for.body:
 ; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -638,12 +471,12 @@ define ptr @simple_csa_ptr_select(i32 %N, ptr %data) {
 ; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP27]], ptr [[T_010]]
 ; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ;
 ; NO-EVL-LABEL: @simple_csa_ptr_select(
 ; NO-EVL-NEXT:  entry:
 ; NO-EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
 ; NO-EVL:       for.body.preheader:
 ; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
 ; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -698,15 +531,15 @@ define ptr @simple_csa_ptr_select(i32 %N, ptr %data) {
 ; NO-EVL-NEXT:    [[TMP25:%.*]] = icmp sge i32 [[TMP24]], 0
 ; NO-EVL-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], ptr [[CSA_EXTRACT]], ptr null
 ; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; NO-EVL:       scalar.ph:
 ; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       for.cond.cleanup.loopexit:
+; NO-EVL:       exit.loopexit:
 ; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; NO-EVL:       for.cond.cleanup:
-; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; NO-EVL-NEXT:    br label [[EXIT]]
+; NO-EVL:       exit:
+; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT]] ]
 ; NO-EVL-NEXT:    ret ptr [[T_0_LCSSA]]
 ; NO-EVL:       for.body:
 ; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -719,103 +552,18 @@ define ptr @simple_csa_ptr_select(i32 %N, ptr %data) {
 ; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP27]], ptr [[T_010]]
 ; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-;
-; DATA-LABEL: @simple_csa_ptr_select(
-; DATA-NEXT:  entry:
-; DATA-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA:       for.body.preheader:
-; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
-; DATA-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; DATA-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DATA:       vector.ph:
-; DATA-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
-; DATA-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; DATA-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; DATA-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
-; DATA-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
-; DATA-NEXT:    [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
-; DATA-NEXT:    [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; DATA-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
-; DATA-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
-; DATA-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP10]]
-; DATA-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
-; DATA-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; DATA-NEXT:    br label [[VECTOR_BODY:%.*]]
-; DATA:       vector.body:
-; DATA-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x ptr> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; DATA-NEXT:    [[TMP13:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[TMP12]]
-; DATA-NEXT:    [[TMP14:%.*]] = getelementptr inbounds ptr, ptr [[TMP13]], i32 0
-; DATA-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x ptr>, ptr [[TMP14]], align 8
-; DATA-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> [[WIDE_LOAD]], i32 4, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i32> poison)
-; DATA-NEXT:    [[TMP15:%.*]] = sext <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i64>
-; DATA-NEXT:    [[TMP16:%.*]] = icmp slt <vscale x 2 x i64> [[VEC_IND]], [[TMP15]]
-; DATA-NEXT:    [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP16]])
-; DATA-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 2 x i1> [[TMP16]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
-; DATA-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 2 x ptr> [[WIDE_LOAD]], <vscale x 2 x ptr> [[CSA_DATA_PHI]]
-; DATA-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; DATA-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; DATA-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DATA-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; DATA:       middle.block:
-; DATA-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; DATA-NEXT:    [[TMP19:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
-; DATA-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP19]])
-; DATA-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
-; DATA-NEXT:    [[TMP22:%.*]] = icmp eq i32 [[TMP20]], 0
-; DATA-NEXT:    [[TMP23:%.*]] = and i1 [[TMP21]], [[TMP22]]
-; DATA-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i32 0, i32 -1
-; DATA-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x ptr> [[CSA_DATA_SEL]], i32 [[TMP24]]
-; DATA-NEXT:    [[TMP25:%.*]] = icmp sge i32 [[TMP24]], 0
-; DATA-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], ptr [[CSA_EXTRACT]], ptr null
-; DATA-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; DATA-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; DATA:       scalar.ph:
-; DATA-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; DATA-NEXT:    br label [[FOR_BODY:%.*]]
-; DATA:       for.cond.cleanup.loopexit:
-; DATA-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
-; DATA:       for.cond.cleanup:
-; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; DATA-NEXT:    ret ptr [[T_0_LCSSA]]
-; DATA:       for.body:
-; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[T_010:%.*]] = phi ptr [ null, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
-; DATA-NEXT:    [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4
-; DATA-NEXT:    [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
-; DATA-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP29]]
-; DATA-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP27]], ptr [[T_010]]
-; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ;
 entry:
   %cmp9 = icmp sgt i32 %N, 0
-  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+  br i1 %cmp9, label %for.body.preheader, label %exit
 
 for.body.preheader:
   %wide.trip.count = zext i32 %N to i64
   br label %for.body
 
-for.cond.cleanup.loopexit:
-  %spec.select.lcssa = phi ptr [ %spec.select, %for.body ]
-  br label %for.cond.cleanup
-
-for.cond.cleanup:
-  %t.0.lcssa = phi ptr [ null, %entry ], [ %spec.select.lcssa, %for.cond.cleanup.loopexit ]
+exit:
+  %t.0.lcssa = phi ptr [ null, %entry ], [ %spec.select, %for.body ]
   ret ptr %t.0.lcssa
 
 for.body:
@@ -829,5 +577,5 @@ for.body:
   %spec.select = select i1 %cmp1, ptr %0, ptr %t.010
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+  br i1 %exitcond.not, label %exit, label %for.body
 }
diff --git a/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment.ll b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment.ll
index 48af3719c06ed1..dc88d3d5d5528c 100644
--- a/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment.ll
+++ b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment.ll
@@ -7,10 +7,6 @@
 ; RUN:   -enable-csa-vectorization -scalable-vectorization=on \
 ; RUN:   -force-target-supports-scalable-vectors -force-target-instruction-cost=1 \
 ; RUN:   | FileCheck %s -check-prefix=NO-EVL
-; RUN: opt < %s -S -passes=loop-vectorize -force-tail-folding-style=data \
-; RUN:   -enable-csa-vectorization -scalable-vectorization=on \
-; RUN:   -force-target-supports-scalable-vectors -force-target-instruction-cost=1 \
-; RUN:   | FileCheck %s -check-prefix=DATA
 
 ; This function is generated from the following C/C++ program:
 ; int simple_csa_int_select(int N, int *data, int a) {
@@ -25,7 +21,7 @@ define i32 @simple_csa_int_select(i32 %N, ptr %data, i64 %a) {
 ; EVL-LABEL: @simple_csa_int_select(
 ; EVL-NEXT:  entry:
 ; EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
 ; EVL:       for.body.preheader:
 ; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
 ; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -67,32 +63,32 @@ define i32 @simple_csa_int_select(i32 %N, ptr %data, i64 %a) {
 ; EVL-NEXT:    [[TMP16:%.*]] = icmp sge i32 [[TMP15]], 0
 ; EVL-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[CSA_EXTRACT]], i32 -1
 ; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; EVL:       scalar.ph:
 ; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       for.cond.cleanup.loopexit:
+; EVL:       exit.loopexit:
 ; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; EVL:       for.cond.cleanup:
-; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; EVL-NEXT:    br label [[EXIT]]
+; EVL:       exit:
+; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT]] ]
 ; EVL-NEXT:    ret i32 [[T_0_LCSSA]]
 ; EVL:       for.body:
-; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; EVL-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
 ; EVL-NEXT:    [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; EVL-NEXT:    [[TMP19:%.*]] = sext i32 [[TMP18]] to i64
 ; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP19]]
 ; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP18]], i32 [[T_010]]
-; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ;
 ; NO-EVL-LABEL: @simple_csa_int_select(
 ; NO-EVL-NEXT:  entry:
 ; NO-EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
 ; NO-EVL:       for.body.preheader:
 ; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
 ; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -134,122 +130,51 @@ define i32 @simple_csa_int_select(i32 %N, ptr %data, i64 %a) {
 ; NO-EVL-NEXT:    [[TMP16:%.*]] = icmp sge i32 [[TMP15]], 0
 ; NO-EVL-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[CSA_EXTRACT]], i32 -1
 ; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; NO-EVL:       scalar.ph:
 ; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       for.cond.cleanup.loopexit:
+; NO-EVL:       exit.loopexit:
 ; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; NO-EVL:       for.cond.cleanup:
-; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; NO-EVL-NEXT:    br label [[EXIT]]
+; NO-EVL:       exit:
+; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT]] ]
 ; NO-EVL-NEXT:    ret i32 [[T_0_LCSSA]]
 ; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; NO-EVL-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
 ; NO-EVL-NEXT:    [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; NO-EVL-NEXT:    [[TMP19:%.*]] = sext i32 [[TMP18]] to i64
 ; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP19]]
 ; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP18]], i32 [[T_010]]
-; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-;
-; DATA-LABEL: @simple_csa_int_select(
-; DATA-NEXT:  entry:
-; DATA-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA:       for.body.preheader:
-; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
-; DATA-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DATA:       vector.ph:
-; DATA-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; DATA-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; DATA-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[A:%.*]], i64 0
-; DATA-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; DATA-NEXT:    br label [[VECTOR_BODY:%.*]]
-; DATA:       vector.body:
-; DATA-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; DATA-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP3]]
-; DATA-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
-; DATA-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP5]], align 4
-; DATA-NEXT:    [[TMP6:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
-; DATA-NEXT:    [[TMP7:%.*]] = icmp slt <vscale x 1 x i64> [[BROADCAST_SPLAT]], [[TMP6]]
-; DATA-NEXT:    [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP7]])
-; DATA-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP8]], <vscale x 1 x i1> [[TMP7]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
-; DATA-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP8]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
-; DATA-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
-; DATA-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DATA-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; DATA:       middle.block:
-; DATA-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; DATA-NEXT:    [[TMP10:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
-; DATA-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP10]])
-; DATA-NEXT:    [[TMP12:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
-; DATA-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[TMP11]], 0
-; DATA-NEXT:    [[TMP14:%.*]] = and i1 [[TMP12]], [[TMP13]]
-; DATA-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], i32 0, i32 -1
-; DATA-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP15]]
-; DATA-NEXT:    [[TMP16:%.*]] = icmp sge i32 [[TMP15]], 0
-; DATA-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[CSA_EXTRACT]], i32 -1
-; DATA-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; DATA-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; DATA:       scalar.ph:
-; DATA-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; DATA-NEXT:    br label [[FOR_BODY:%.*]]
-; DATA:       for.cond.cleanup.loopexit:
-; DATA-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
-; DATA:       for.cond.cleanup:
-; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; DATA-NEXT:    ret i32 [[T_0_LCSSA]]
-; DATA:       for.body:
-; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; DATA-NEXT:    [[TMP19:%.*]] = sext i32 [[TMP18]] to i64
-; DATA-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP19]]
-; DATA-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP18]], i32 [[T_010]]
-; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; NO-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ;
 entry:
   %cmp9 = icmp sgt i32 %N, 0
-  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+  br i1 %cmp9, label %for.body.preheader, label %exit
 
 for.body.preheader:                               ; preds = %entry
   %wide.trip.count = zext i32 %N to i64
   br label %for.body
 
-for.cond.cleanup.loopexit:                        ; preds = %for.body
-  %spec.select.lcssa = phi i32 [ %spec.select, %for.body ]
-  br label %for.cond.cleanup
-
-for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
-  %t.0.lcssa = phi i32 [ -1, %entry ], [ %spec.select.lcssa, %for.cond.cleanup.loopexit ]
+exit:                                 ; preds = %for.body, %entry
+  %t.0.lcssa = phi i32 [ -1, %entry ], [ %spec.select, %for.body ]
   ret i32 %t.0.lcssa
 
 for.body:                                         ; preds = %for.body.preheader, %for.body
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
   %t.010 = phi i32 [ -1, %for.body.preheader ], [ %spec.select, %for.body ]
-  %arrayidx = getelementptr inbounds i32, ptr %data, i64 %indvars.iv
+  %arrayidx = getelementptr inbounds i32, ptr %data, i64 %iv
   %0 = load i32, ptr %arrayidx, align 4
   %1 = sext i32 %0 to i64
   %cmp1 = icmp slt i64 %a, %1
   %spec.select = select i1 %cmp1, i32 %0, i32 %t.010
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %exit, label %for.body
 }
 
 ; This function is generated from the following C/C++ program:
@@ -265,7 +190,7 @@ define i32 @simple_csa_int_select_induction_cmp(i32 %N, ptr %data) {
 ; EVL-LABEL: @simple_csa_int_select_induction_cmp(
 ; EVL-NEXT:  entry:
 ; EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
 ; EVL:       for.body.preheader:
 ; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
 ; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -315,32 +240,32 @@ define i32 @simple_csa_int_select_induction_cmp(i32 %N, ptr %data) {
 ; EVL-NEXT:    [[TMP21:%.*]] = icmp sge i32 [[TMP20]], 0
 ; EVL-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[CSA_EXTRACT]], i32 -1
 ; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; EVL:       scalar.ph:
 ; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       for.cond.cleanup.loopexit:
+; EVL:       exit.loopexit:
 ; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; EVL:       for.cond.cleanup:
-; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; EVL-NEXT:    br label [[EXIT]]
+; EVL:       exit:
+; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT]] ]
 ; EVL-NEXT:    ret i32 [[T_0_LCSSA]]
 ; EVL:       for.body:
-; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; EVL-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
 ; EVL-NEXT:    [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; EVL-NEXT:    [[TMP24:%.*]] = sext i32 [[TMP23]] to i64
-; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP24]]
+; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[IV]], [[TMP24]]
 ; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP23]], i32 [[T_010]]
-; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ;
 ; NO-EVL-LABEL: @simple_csa_int_select_induction_cmp(
 ; NO-EVL-NEXT:  entry:
 ; NO-EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
 ; NO-EVL:       for.body.preheader:
 ; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
 ; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -390,130 +315,51 @@ define i32 @simple_csa_int_select_induction_cmp(i32 %N, ptr %data) {
 ; NO-EVL-NEXT:    [[TMP21:%.*]] = icmp sge i32 [[TMP20]], 0
 ; NO-EVL-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[CSA_EXTRACT]], i32 -1
 ; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; NO-EVL:       scalar.ph:
 ; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       for.cond.cleanup.loopexit:
+; NO-EVL:       exit.loopexit:
 ; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; NO-EVL:       for.cond.cleanup:
-; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; NO-EVL-NEXT:    br label [[EXIT]]
+; NO-EVL:       exit:
+; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT]] ]
 ; NO-EVL-NEXT:    ret i32 [[T_0_LCSSA]]
 ; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; NO-EVL-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
 ; NO-EVL-NEXT:    [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; NO-EVL-NEXT:    [[TMP24:%.*]] = sext i32 [[TMP23]] to i64
-; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP24]]
+; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[IV]], [[TMP24]]
 ; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP23]], i32 [[T_010]]
-; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-;
-; DATA-LABEL: @simple_csa_int_select_induction_cmp(
-; DATA-NEXT:  entry:
-; DATA-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA:       for.body.preheader:
-; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
-; DATA-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DATA:       vector.ph:
-; DATA-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; DATA-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; DATA-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
-; DATA-NEXT:    [[TMP4:%.*]] = add <vscale x 1 x i64> [[TMP3]], zeroinitializer
-; DATA-NEXT:    [[TMP5:%.*]] = mul <vscale x 1 x i64> [[TMP4]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
-; DATA-NEXT:    [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP5]]
-; DATA-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP7:%.*]] = mul i64 1, [[TMP6]]
-; DATA-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP7]], i64 0
-; DATA-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; DATA-NEXT:    br label [[VECTOR_BODY:%.*]]
-; DATA:       vector.body:
-; DATA-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 0
-; DATA-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP8]]
-; DATA-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
-; DATA-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP10]], align 4
-; DATA-NEXT:    [[TMP11:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
-; DATA-NEXT:    [[TMP12:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP11]]
-; DATA-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP12]])
-; DATA-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP13]], <vscale x 1 x i1> [[TMP12]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
-; DATA-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP13]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
-; DATA-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
-; DATA-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; DATA-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DATA-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; DATA:       middle.block:
-; DATA-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; DATA-NEXT:    [[TMP15:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
-; DATA-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP15]])
-; DATA-NEXT:    [[TMP17:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
-; DATA-NEXT:    [[TMP18:%.*]] = icmp eq i32 [[TMP16]], 0
-; DATA-NEXT:    [[TMP19:%.*]] = and i1 [[TMP17]], [[TMP18]]
-; DATA-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 0, i32 -1
-; DATA-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP20]]
-; DATA-NEXT:    [[TMP21:%.*]] = icmp sge i32 [[TMP20]], 0
-; DATA-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[CSA_EXTRACT]], i32 -1
-; DATA-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; DATA-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; DATA:       scalar.ph:
-; DATA-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; DATA-NEXT:    br label [[FOR_BODY:%.*]]
-; DATA:       for.cond.cleanup.loopexit:
-; DATA-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
-; DATA:       for.cond.cleanup:
-; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; DATA-NEXT:    ret i32 [[T_0_LCSSA]]
-; DATA:       for.body:
-; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; DATA-NEXT:    [[TMP24:%.*]] = sext i32 [[TMP23]] to i64
-; DATA-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP24]]
-; DATA-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP23]], i32 [[T_010]]
-; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; NO-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ;
 entry:
   %cmp9 = icmp sgt i32 %N, 0
-  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+  br i1 %cmp9, label %for.body.preheader, label %exit
 
 for.body.preheader:                               ; preds = %entry
   %wide.trip.count = zext i32 %N to i64
   br label %for.body
 
-for.cond.cleanup.loopexit:                        ; preds = %for.body
-  %spec.select.lcssa = phi i32 [ %spec.select, %for.body ]
-  br label %for.cond.cleanup
-
-for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
-  %t.0.lcssa = phi i32 [ -1, %entry ], [ %spec.select.lcssa, %for.cond.cleanup.loopexit ]
+exit:                                 ; preds = %for.body, %entry
+  %t.0.lcssa = phi i32 [ -1, %entry ], [ %spec.select, %for.body ]
   ret i32 %t.0.lcssa
 
 for.body:                                         ; preds = %for.body.preheader, %for.body
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
   %t.010 = phi i32 [ -1, %for.body.preheader ], [ %spec.select, %for.body ]
-  %arrayidx = getelementptr inbounds i32, ptr %data, i64 %indvars.iv
+  %arrayidx = getelementptr inbounds i32, ptr %data, i64 %iv
   %0 = load i32, ptr %arrayidx, align 4
   %1 = sext i32 %0 to i64
-  %cmp1 = icmp slt i64 %indvars.iv, %1
+  %cmp1 = icmp slt i64 %iv, %1
   %spec.select = select i1 %cmp1, i32 %0, i32 %t.010
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %exit, label %for.body
 }
 
 ; This function is generated from the following C/C++ program:
@@ -529,7 +375,7 @@ define float @simple_csa_float_select(i32 %N, ptr %data) {
 ; EVL-LABEL: @simple_csa_float_select(
 ; EVL-NEXT:  entry:
 ; EVL-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL-NEXT:    br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
 ; EVL:       for.body.preheader:
 ; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
 ; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -568,31 +414,31 @@ define float @simple_csa_float_select(i32 %N, ptr %data) {
 ; EVL-NEXT:    [[TMP15:%.*]] = icmp sge i32 [[TMP14]], 0
 ; EVL-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], float [[CSA_EXTRACT]], float 1.000000e+00
 ; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; EVL:       scalar.ph:
 ; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       for.cond.cleanup.loopexit:
+; EVL:       exit.loopexit:
 ; EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; EVL:       for.cond.cleanup:
-; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; EVL-NEXT:    br label [[EXIT]]
+; EVL:       exit:
+; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[EXIT_LOOPEXIT]] ]
 ; EVL-NEXT:    ret float [[T_0_LCSSA]]
 ; EVL:       for.body:
-; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; EVL-NEXT:    [[T_09:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[IV]]
 ; EVL-NEXT:    [[TMP17:%.*]] = load float, ptr [[ARRAYIDX]], align 4
 ; EVL-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP17]], 0.000000e+00
 ; EVL-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP17]], float [[T_09]]
-; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ;
 ; NO-EVL-LABEL: @simple_csa_float_select(
 ; NO-EVL-NEXT:  entry:
 ; NO-EVL-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL-NEXT:    br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
 ; NO-EVL:       for.body.preheader:
 ; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
 ; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -631,112 +477,49 @@ define float @simple_csa_float_select(i32 %N, ptr %data) {
 ; NO-EVL-NEXT:    [[TMP15:%.*]] = icmp sge i32 [[TMP14]], 0
 ; NO-EVL-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], float [[CSA_EXTRACT]], float 1.000000e+00
 ; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; NO-EVL:       scalar.ph:
 ; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       for.cond.cleanup.loopexit:
+; NO-EVL:       exit.loopexit:
 ; NO-EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; NO-EVL:       for.cond.cleanup:
-; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; NO-EVL-NEXT:    br label [[EXIT]]
+; NO-EVL:       exit:
+; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[EXIT_LOOPEXIT]] ]
 ; NO-EVL-NEXT:    ret float [[T_0_LCSSA]]
 ; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; NO-EVL-NEXT:    [[T_09:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[IV]]
 ; NO-EVL-NEXT:    [[TMP17:%.*]] = load float, ptr [[ARRAYIDX]], align 4
 ; NO-EVL-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP17]], 0.000000e+00
 ; NO-EVL-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP17]], float [[T_09]]
-; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-;
-; DATA-LABEL: @simple_csa_float_select(
-; DATA-NEXT:  entry:
-; DATA-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT:    br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA:       for.body.preheader:
-; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
-; DATA-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DATA:       vector.ph:
-; DATA-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; DATA-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; DATA-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    br label [[VECTOR_BODY:%.*]]
-; DATA:       vector.body:
-; DATA-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; DATA-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[TMP3]]
-; DATA-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 0
-; DATA-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x float>, ptr [[TMP5]], align 4
-; DATA-NEXT:    [[TMP6:%.*]] = fcmp ogt <vscale x 1 x float> [[WIDE_LOAD]], zeroinitializer
-; DATA-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP6]])
-; DATA-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP7]], <vscale x 1 x i1> [[TMP6]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
-; DATA-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP7]], <vscale x 1 x float> [[WIDE_LOAD]], <vscale x 1 x float> [[CSA_DATA_PHI]]
-; DATA-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
-; DATA-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DATA-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; DATA:       middle.block:
-; DATA-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; DATA-NEXT:    [[TMP9:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
-; DATA-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP9]])
-; DATA-NEXT:    [[TMP11:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
-; DATA-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[TMP10]], 0
-; DATA-NEXT:    [[TMP13:%.*]] = and i1 [[TMP11]], [[TMP12]]
-; DATA-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 0, i32 -1
-; DATA-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x float> [[CSA_DATA_SEL]], i32 [[TMP14]]
-; DATA-NEXT:    [[TMP15:%.*]] = icmp sge i32 [[TMP14]], 0
-; DATA-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], float [[CSA_EXTRACT]], float 1.000000e+00
-; DATA-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; DATA-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; DATA:       scalar.ph:
-; DATA-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; DATA-NEXT:    br label [[FOR_BODY:%.*]]
-; DATA:       for.cond.cleanup.loopexit:
-; DATA-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
-; DATA:       for.cond.cleanup:
-; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; DATA-NEXT:    ret float [[T_0_LCSSA]]
-; DATA:       for.body:
-; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[T_09:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP17:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; DATA-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP17]], 0.000000e+00
-; DATA-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP17]], float [[T_09]]
-; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; NO-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ;
 entry:
   %cmp8 = icmp sgt i32 %N, 0
-  br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup
+  br i1 %cmp8, label %for.body.preheader, label %exit
 
 for.body.preheader:                               ; preds = %entry
   %wide.trip.count = zext i32 %N to i64
   br label %for.body
 
-for.cond.cleanup:                                 ; preds = %for.body, %entry
+exit:                                 ; preds = %for.body, %entry
   %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.1, %for.body ]
   ret float %t.0.lcssa
 
 for.body:                                         ; preds = %for.body.preheader, %for.body
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
   %t.09 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.body ]
-  %arrayidx = getelementptr inbounds float, ptr %data, i64 %indvars.iv
+  %arrayidx = getelementptr inbounds float, ptr %data, i64 %iv
   %0 = load float, ptr %arrayidx, align 4
   %cmp1 = fcmp ogt float %0, 0.000000e+00
   %t.1 = select i1 %cmp1, float %0, float %t.09
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %exit, label %for.body
 }
 
 ; This function is generated from the following C/C++ program:
@@ -755,39 +538,36 @@ define i32 @simple_csa_int(i32 %N, ptr %cond, ptr %data) {
 ; NO-EVL-LABEL: @simple_csa_int(
 ; NO-EVL-NOT: vector.body:
 ;
-; DATA-LABEL: @simple_csa_int(
-; DATA-NOT: vector.body:
-;
 entry:
   %cmp6 = icmp sgt i32 %N, 0
-  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+  br i1 %cmp6, label %for.body.preheader, label %exit
 
 for.body.preheader:                               ; preds = %entry
   %wide.trip.count = zext i32 %N to i64
   br label %for.body
 
-for.cond.cleanup:                                 ; preds = %for.inc, %entry
+exit:                                 ; preds = %for.inc, %entry
   %t.0.lcssa = phi i32 [ -1, %entry ], [ %t.1, %for.inc ]
   ret i32 %t.0.lcssa
 
 for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
   %t.07 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
-  %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %indvars.iv
+  %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %iv
   %0 = load i8, ptr %arrayidx, align 1
   %tobool.not = icmp eq i8 %0, 0
   br i1 %tobool.not, label %for.inc, label %if.then
 
 if.then:                                          ; preds = %for.body
-  %arrayidx2 = getelementptr inbounds i32, ptr %data, i64 %indvars.iv
+  %arrayidx2 = getelementptr inbounds i32, ptr %data, i64 %iv
   %1 = load i32, ptr %arrayidx2, align 4
   br label %for.inc
 
 for.inc:                                          ; preds = %for.body, %if.then
   %t.1 = phi i32 [ %1, %if.then ], [ %t.07, %for.body ]
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %exit, label %for.body
 }
 
 ; This function is generated from the following C/C++ program:
@@ -806,39 +586,36 @@ define float @simple_csa_float(i32 %N, ptr %cond, ptr %data) {
 ; NO-EVL-LABEL: @simple_csa_float(
 ; NO-EVL-NOT: vector.body:
 ;
-; DATA-LABEL: @simple_csa_float(
-; NO-EVL-NOT: vector.body:
-;
 entry:
   %cmp6 = icmp sgt i32 %N, 0
-  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+  br i1 %cmp6, label %for.body.preheader, label %exit
 
 for.body.preheader:                               ; preds = %entry
   %wide.trip.count = zext i32 %N to i64
   br label %for.body
 
-for.cond.cleanup:                                 ; preds = %for.inc, %entry
+exit:                                 ; preds = %for.inc, %entry
   %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.1, %for.inc ]
   ret float %t.0.lcssa
 
 for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
   %t.07 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
-  %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %indvars.iv
+  %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %iv
   %0 = load i8, ptr %arrayidx, align 1
   %tobool.not = icmp eq i8 %0, 0
   br i1 %tobool.not, label %for.inc, label %if.then
 
 if.then:                                          ; preds = %for.body
-  %arrayidx2 = getelementptr inbounds float, ptr %data, i64 %indvars.iv
+  %arrayidx2 = getelementptr inbounds float, ptr %data, i64 %iv
   %1 = load float, ptr %arrayidx2, align 4
   br label %for.inc
 
 for.inc:                                          ; preds = %for.body, %if.then
   %t.1 = phi float [ %1, %if.then ], [ %t.07, %for.body ]
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %exit, label %for.body
 }
 
 ; This function is generated from the following C/C++ program:
@@ -857,7 +634,7 @@ define i32 @csa_in_series_int_select(i32 %N, ptr %data0, ptr %data1, i64 %a) {
 ; EVL-LABEL: @csa_in_series_int_select(
 ; EVL-NEXT:  entry:
 ; EVL-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
 ; EVL:       for.body.preheader:
 ; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
 ; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -919,40 +696,40 @@ define i32 @csa_in_series_int_select(i32 %N, ptr %data0, ptr %data1, i64 %a) {
 ; EVL-NEXT:    [[TMP29:%.*]] = icmp sge i32 [[TMP28]], 0
 ; EVL-NEXT:    [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[CSA_EXTRACT]], i32 -1
 ; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; EVL:       scalar.ph:
 ; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       for.cond.cleanup.loopexit:
+; EVL:       exit.loopexit:
 ; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP30]], [[MIDDLE_BLOCK]] ]
 ; EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
 ; EVL-NEXT:    [[TMP31:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
-; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; EVL:       for.cond.cleanup:
-; EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP31]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
+; EVL-NEXT:    br label [[EXIT]]
+; EVL:       exit:
+; EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP31]], [[EXIT_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
 ; EVL-NEXT:    ret i32 [[OR]]
 ; EVL:       for.body:
-; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; EVL-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
 ; EVL-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[IV]]
 ; EVL-NEXT:    [[TMP32:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; EVL-NEXT:    [[TMP33:%.*]] = sext i32 [[TMP32]] to i64
 ; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP33]]
 ; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP32]], i32 [[T_022]]
-; EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[IV]]
 ; EVL-NEXT:    [[TMP34:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
 ; EVL-NEXT:    [[TMP35:%.*]] = sext i32 [[TMP34]] to i64
 ; EVL-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[A]], [[TMP35]]
 ; EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP34]], i32 [[S_023]]
-; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ;
 ; NO-EVL-LABEL: @csa_in_series_int_select(
 ; NO-EVL-NEXT:  entry:
 ; NO-EVL-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
 ; NO-EVL:       for.body.preheader:
 ; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
 ; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -1014,164 +791,69 @@ define i32 @csa_in_series_int_select(i32 %N, ptr %data0, ptr %data1, i64 %a) {
 ; NO-EVL-NEXT:    [[TMP29:%.*]] = icmp sge i32 [[TMP28]], 0
 ; NO-EVL-NEXT:    [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[CSA_EXTRACT]], i32 -1
 ; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; NO-EVL:       scalar.ph:
 ; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       for.cond.cleanup.loopexit:
+; NO-EVL:       exit.loopexit:
 ; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP30]], [[MIDDLE_BLOCK]] ]
 ; NO-EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
 ; NO-EVL-NEXT:    [[TMP31:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
-; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; NO-EVL:       for.cond.cleanup:
-; NO-EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP31]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
+; NO-EVL-NEXT:    br label [[EXIT]]
+; NO-EVL:       exit:
+; NO-EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP31]], [[EXIT_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
 ; NO-EVL-NEXT:    ret i32 [[OR]]
 ; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; NO-EVL-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
 ; NO-EVL-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[IV]]
 ; NO-EVL-NEXT:    [[TMP32:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; NO-EVL-NEXT:    [[TMP33:%.*]] = sext i32 [[TMP32]] to i64
 ; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP33]]
 ; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP32]], i32 [[T_022]]
-; NO-EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[IV]]
 ; NO-EVL-NEXT:    [[TMP34:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
 ; NO-EVL-NEXT:    [[TMP35:%.*]] = sext i32 [[TMP34]] to i64
 ; NO-EVL-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[A]], [[TMP35]]
 ; NO-EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP34]], i32 [[S_023]]
-; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-;
-; DATA-LABEL: @csa_in_series_int_select(
-; DATA-NEXT:  entry:
-; DATA-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA:       for.body.preheader:
-; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
-; DATA-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DATA:       vector.ph:
-; DATA-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; DATA-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; DATA-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[A:%.*]], i64 0
-; DATA-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; DATA-NEXT:    br label [[VECTOR_BODY:%.*]]
-; DATA:       vector.body:
-; DATA-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; DATA-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP3]]
-; DATA-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
-; DATA-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP5]], align 4
-; DATA-NEXT:    [[TMP6:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
-; DATA-NEXT:    [[TMP7:%.*]] = icmp slt <vscale x 1 x i64> [[BROADCAST_SPLAT]], [[TMP6]]
-; DATA-NEXT:    [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP7]])
-; DATA-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP8]], <vscale x 1 x i1> [[TMP7]], <vscale x 1 x i1> [[CSA_MASK_PHI1]]
-; DATA-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP8]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI2]]
-; DATA-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP3]]
-; DATA-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
-; DATA-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 1 x i32>, ptr [[TMP10]], align 4
-; DATA-NEXT:    [[TMP11:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD3]] to <vscale x 1 x i64>
-; DATA-NEXT:    [[TMP12:%.*]] = icmp slt <vscale x 1 x i64> [[BROADCAST_SPLAT]], [[TMP11]]
-; DATA-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP12]])
-; DATA-NEXT:    [[CSA_MASK_SEL4]] = select i1 [[TMP13]], <vscale x 1 x i1> [[TMP12]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
-; DATA-NEXT:    [[CSA_DATA_SEL5]] = select i1 [[TMP13]], <vscale x 1 x i32> [[WIDE_LOAD3]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
-; DATA-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
-; DATA-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DATA-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; DATA:       middle.block:
-; DATA-NEXT:    [[CSA_STEP6:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; DATA-NEXT:    [[TMP15:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL4]], <vscale x 1 x i32> [[CSA_STEP6]], <vscale x 1 x i32> zeroinitializer
-; DATA-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP15]])
-; DATA-NEXT:    [[TMP17:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL4]], i64 0
-; DATA-NEXT:    [[TMP18:%.*]] = icmp eq i32 [[TMP16]], 0
-; DATA-NEXT:    [[TMP19:%.*]] = and i1 [[TMP17]], [[TMP18]]
-; DATA-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 0, i32 -1
-; DATA-NEXT:    [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL5]], i32 [[TMP20]]
-; DATA-NEXT:    [[TMP21:%.*]] = icmp sge i32 [[TMP20]], 0
-; DATA-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[CSA_EXTRACT7]], i32 -1
-; DATA-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; DATA-NEXT:    [[TMP23:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
-; DATA-NEXT:    [[TMP24:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP23]])
-; DATA-NEXT:    [[TMP25:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
-; DATA-NEXT:    [[TMP26:%.*]] = icmp eq i32 [[TMP24]], 0
-; DATA-NEXT:    [[TMP27:%.*]] = and i1 [[TMP25]], [[TMP26]]
-; DATA-NEXT:    [[TMP28:%.*]] = select i1 [[TMP27]], i32 0, i32 -1
-; DATA-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP28]]
-; DATA-NEXT:    [[TMP29:%.*]] = icmp sge i32 [[TMP28]], 0
-; DATA-NEXT:    [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[CSA_EXTRACT]], i32 -1
-; DATA-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; DATA-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; DATA:       scalar.ph:
-; DATA-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; DATA-NEXT:    br label [[FOR_BODY:%.*]]
-; DATA:       for.cond.cleanup.loopexit:
-; DATA-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP30]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT:    [[TMP31:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
-; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
-; DATA:       for.cond.cleanup:
-; DATA-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP31]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
-; DATA-NEXT:    ret i32 [[OR]]
-; DATA:       for.body:
-; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP32:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; DATA-NEXT:    [[TMP33:%.*]] = sext i32 [[TMP32]] to i64
-; DATA-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP33]]
-; DATA-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP32]], i32 [[T_022]]
-; DATA-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP34:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; DATA-NEXT:    [[TMP35:%.*]] = sext i32 [[TMP34]] to i64
-; DATA-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[A]], [[TMP35]]
-; DATA-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP34]], i32 [[S_023]]
-; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; NO-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ;
 entry:
   %cmp21 = icmp sgt i32 %N, 0
-  br i1 %cmp21, label %for.body.preheader, label %for.cond.cleanup
+  br i1 %cmp21, label %for.body.preheader, label %exit
 
 for.body.preheader:                               ; preds = %entry
   %wide.trip.count = zext i32 %N to i64
   br label %for.body
 
-for.cond.cleanup.loopexit:                        ; preds = %for.body
+exit.loopexit:                        ; preds = %for.body
   %0 = or i32 %s.1, %spec.select
-  br label %for.cond.cleanup
+  br label %exit
 
-for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
-  %or = phi i32 [ %0, %for.cond.cleanup.loopexit ], [ -1, %entry ]
+exit:                                 ; preds = %exit.loopexit, %entry
+  %or = phi i32 [ %0, %exit.loopexit ], [ -1, %entry ]
   ret i32 %or
 
 for.body:                                         ; preds = %for.body.preheader, %for.body
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
   %s.023 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.body ]
   %t.022 = phi i32 [ -1, %for.body.preheader ], [ %spec.select, %for.body ]
-  %arrayidx = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
+  %arrayidx = getelementptr inbounds i32, ptr %data0, i64 %iv
   %1 = load i32, ptr %arrayidx, align 4
   %2 = sext i32 %1 to i64
   %cmp1 = icmp slt i64 %a, %2
   %spec.select = select i1 %cmp1, i32 %1, i32 %t.022
-  %arrayidx5 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
+  %arrayidx5 = getelementptr inbounds i32, ptr %data1, i64 %iv
   %3 = load i32, ptr %arrayidx5, align 4
   %4 = sext i32 %3 to i64
   %cmp6 = icmp slt i64 %a, %4
   %s.1 = select i1 %cmp6, i32 %3, i32 %s.023
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %exit.loopexit, label %for.body
 }
 
 ; This function is generated from the following C/C++ program:
@@ -1190,7 +872,7 @@ define i32 @csa_in_series_int_select_induction_cmp(i32 %N, ptr %data0, ptr %data
 ; EVL-LABEL: @csa_in_series_int_select_induction_cmp(
 ; EVL-NEXT:  entry:
 ; EVL-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
 ; EVL:       for.body.preheader:
 ; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
 ; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -1260,40 +942,40 @@ define i32 @csa_in_series_int_select_induction_cmp(i32 %N, ptr %data0, ptr %data
 ; EVL-NEXT:    [[TMP34:%.*]] = icmp sge i32 [[TMP33]], 0
 ; EVL-NEXT:    [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[CSA_EXTRACT]], i32 -1
 ; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; EVL:       scalar.ph:
 ; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       for.cond.cleanup.loopexit:
+; EVL:       exit.loopexit:
 ; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP35]], [[MIDDLE_BLOCK]] ]
 ; EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
 ; EVL-NEXT:    [[TMP36:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
-; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; EVL:       for.cond.cleanup:
-; EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP36]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
+; EVL-NEXT:    br label [[EXIT]]
+; EVL:       exit:
+; EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP36]], [[EXIT_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
 ; EVL-NEXT:    ret i32 [[OR]]
 ; EVL:       for.body:
-; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; EVL-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
 ; EVL-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[IV]]
 ; EVL-NEXT:    [[TMP37:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; EVL-NEXT:    [[TMP38:%.*]] = sext i32 [[TMP37]] to i64
-; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP38]]
+; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[IV]], [[TMP38]]
 ; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP37]], i32 [[T_022]]
-; EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[IV]]
 ; EVL-NEXT:    [[TMP39:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
 ; EVL-NEXT:    [[TMP40:%.*]] = sext i32 [[TMP39]] to i64
-; EVL-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP40]]
+; EVL-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[IV]], [[TMP40]]
 ; EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP39]], i32 [[S_023]]
-; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ;
 ; NO-EVL-LABEL: @csa_in_series_int_select_induction_cmp(
 ; NO-EVL-NEXT:  entry:
 ; NO-EVL-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
 ; NO-EVL:       for.body.preheader:
 ; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
 ; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -1363,172 +1045,69 @@ define i32 @csa_in_series_int_select_induction_cmp(i32 %N, ptr %data0, ptr %data
 ; NO-EVL-NEXT:    [[TMP34:%.*]] = icmp sge i32 [[TMP33]], 0
 ; NO-EVL-NEXT:    [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[CSA_EXTRACT]], i32 -1
 ; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; NO-EVL:       scalar.ph:
 ; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       for.cond.cleanup.loopexit:
+; NO-EVL:       exit.loopexit:
 ; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP35]], [[MIDDLE_BLOCK]] ]
 ; NO-EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
 ; NO-EVL-NEXT:    [[TMP36:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
-; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; NO-EVL:       for.cond.cleanup:
-; NO-EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP36]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
+; NO-EVL-NEXT:    br label [[EXIT]]
+; NO-EVL:       exit:
+; NO-EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP36]], [[EXIT_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
 ; NO-EVL-NEXT:    ret i32 [[OR]]
 ; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; NO-EVL-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
 ; NO-EVL-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[IV]]
 ; NO-EVL-NEXT:    [[TMP37:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; NO-EVL-NEXT:    [[TMP38:%.*]] = sext i32 [[TMP37]] to i64
-; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP38]]
+; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[IV]], [[TMP38]]
 ; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP37]], i32 [[T_022]]
-; NO-EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[IV]]
 ; NO-EVL-NEXT:    [[TMP39:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
 ; NO-EVL-NEXT:    [[TMP40:%.*]] = sext i32 [[TMP39]] to i64
-; NO-EVL-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP40]]
+; NO-EVL-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[IV]], [[TMP40]]
 ; NO-EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP39]], i32 [[S_023]]
-; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-;
-; DATA-LABEL: @csa_in_series_int_select_induction_cmp(
-; DATA-NEXT:  entry:
-; DATA-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA:       for.body.preheader:
-; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
-; DATA-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DATA:       vector.ph:
-; DATA-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; DATA-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; DATA-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
-; DATA-NEXT:    [[TMP4:%.*]] = add <vscale x 1 x i64> [[TMP3]], zeroinitializer
-; DATA-NEXT:    [[TMP5:%.*]] = mul <vscale x 1 x i64> [[TMP4]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
-; DATA-NEXT:    [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP5]]
-; DATA-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP7:%.*]] = mul i64 1, [[TMP6]]
-; DATA-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP7]], i64 0
-; DATA-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; DATA-NEXT:    br label [[VECTOR_BODY:%.*]]
-; DATA:       vector.body:
-; DATA-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 0
-; DATA-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP8]]
-; DATA-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
-; DATA-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP10]], align 4
-; DATA-NEXT:    [[TMP11:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
-; DATA-NEXT:    [[TMP12:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP11]]
-; DATA-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP12]])
-; DATA-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP13]], <vscale x 1 x i1> [[TMP12]], <vscale x 1 x i1> [[CSA_MASK_PHI1]]
-; DATA-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP13]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI2]]
-; DATA-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP8]]
-; DATA-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
-; DATA-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 1 x i32>, ptr [[TMP15]], align 4
-; DATA-NEXT:    [[TMP16:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD3]] to <vscale x 1 x i64>
-; DATA-NEXT:    [[TMP17:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP16]]
-; DATA-NEXT:    [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP17]])
-; DATA-NEXT:    [[CSA_MASK_SEL4]] = select i1 [[TMP18]], <vscale x 1 x i1> [[TMP17]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
-; DATA-NEXT:    [[CSA_DATA_SEL5]] = select i1 [[TMP18]], <vscale x 1 x i32> [[WIDE_LOAD3]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
-; DATA-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
-; DATA-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; DATA-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DATA-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; DATA:       middle.block:
-; DATA-NEXT:    [[CSA_STEP6:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; DATA-NEXT:    [[TMP20:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL4]], <vscale x 1 x i32> [[CSA_STEP6]], <vscale x 1 x i32> zeroinitializer
-; DATA-NEXT:    [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP20]])
-; DATA-NEXT:    [[TMP22:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL4]], i64 0
-; DATA-NEXT:    [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
-; DATA-NEXT:    [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
-; DATA-NEXT:    [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
-; DATA-NEXT:    [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL5]], i32 [[TMP25]]
-; DATA-NEXT:    [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
-; DATA-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[CSA_EXTRACT7]], i32 -1
-; DATA-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; DATA-NEXT:    [[TMP28:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
-; DATA-NEXT:    [[TMP29:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP28]])
-; DATA-NEXT:    [[TMP30:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
-; DATA-NEXT:    [[TMP31:%.*]] = icmp eq i32 [[TMP29]], 0
-; DATA-NEXT:    [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]]
-; DATA-NEXT:    [[TMP33:%.*]] = select i1 [[TMP32]], i32 0, i32 -1
-; DATA-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP33]]
-; DATA-NEXT:    [[TMP34:%.*]] = icmp sge i32 [[TMP33]], 0
-; DATA-NEXT:    [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[CSA_EXTRACT]], i32 -1
-; DATA-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; DATA-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; DATA:       scalar.ph:
-; DATA-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; DATA-NEXT:    br label [[FOR_BODY:%.*]]
-; DATA:       for.cond.cleanup.loopexit:
-; DATA-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP35]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT:    [[TMP36:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
-; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
-; DATA:       for.cond.cleanup:
-; DATA-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP36]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
-; DATA-NEXT:    ret i32 [[OR]]
-; DATA:       for.body:
-; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP37:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; DATA-NEXT:    [[TMP38:%.*]] = sext i32 [[TMP37]] to i64
-; DATA-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP38]]
-; DATA-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP37]], i32 [[T_022]]
-; DATA-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP39:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; DATA-NEXT:    [[TMP40:%.*]] = sext i32 [[TMP39]] to i64
-; DATA-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP40]]
-; DATA-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP39]], i32 [[S_023]]
-; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; NO-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ;
 entry:
   %cmp21 = icmp sgt i32 %N, 0
-  br i1 %cmp21, label %for.body.preheader, label %for.cond.cleanup
+  br i1 %cmp21, label %for.body.preheader, label %exit
 
 for.body.preheader:                               ; preds = %entry
   %wide.trip.count = zext i32 %N to i64
   br label %for.body
 
-for.cond.cleanup.loopexit:                        ; preds = %for.body
+exit.loopexit:                        ; preds = %for.body
   %0 = or i32 %s.1, %spec.select
-  br label %for.cond.cleanup
+  br label %exit
 
-for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
-  %or = phi i32 [ %0, %for.cond.cleanup.loopexit ], [ -1, %entry ]
+exit:                                 ; preds = %exit.loopexit, %entry
+  %or = phi i32 [ %0, %exit.loopexit ], [ -1, %entry ]
   ret i32 %or
 
 for.body:                                         ; preds = %for.body.preheader, %for.body
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
   %s.023 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.body ]
   %t.022 = phi i32 [ -1, %for.body.preheader ], [ %spec.select, %for.body ]
-  %arrayidx = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
+  %arrayidx = getelementptr inbounds i32, ptr %data0, i64 %iv
   %1 = load i32, ptr %arrayidx, align 4
   %2 = sext i32 %1 to i64
-  %cmp1 = icmp slt i64 %indvars.iv, %2
+  %cmp1 = icmp slt i64 %iv, %2
   %spec.select = select i1 %cmp1, i32 %1, i32 %t.022
-  %arrayidx5 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
+  %arrayidx5 = getelementptr inbounds i32, ptr %data1, i64 %iv
   %3 = load i32, ptr %arrayidx5, align 4
   %4 = sext i32 %3 to i64
-  %cmp6 = icmp slt i64 %indvars.iv, %4
+  %cmp6 = icmp slt i64 %iv, %4
   %s.1 = select i1 %cmp6, i32 %3, i32 %s.023
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %exit.loopexit, label %for.body
 }
 
 ; This function is generated from the following C/C++ program:
@@ -1548,7 +1127,7 @@ define float @csa_in_series_float_select(i32 %N, ptr %data0, ptr %data1) {
 ; EVL-LABEL: @csa_in_series_float_select(
 ; EVL-NEXT:  entry:
 ; EVL-NEXT:    [[CMP19:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL-NEXT:    br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
 ; EVL:       for.body.preheader:
 ; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
 ; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -1606,38 +1185,38 @@ define float @csa_in_series_float_select(i32 %N, ptr %data0, ptr %data1) {
 ; EVL-NEXT:    [[TMP27:%.*]] = icmp sge i32 [[TMP26]], 0
 ; EVL-NEXT:    [[TMP28:%.*]] = select i1 [[TMP27]], float [[CSA_EXTRACT]], float 1.000000e+00
 ; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; EVL:       scalar.ph:
 ; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       for.cond.cleanup.loopexit:
+; EVL:       exit.loopexit:
 ; EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ]
 ; EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
 ; EVL-NEXT:    [[TMP29:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
-; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; EVL:       for.cond.cleanup:
-; EVL-NEXT:    [[ADD:%.*]] = phi float [ [[TMP29]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
+; EVL-NEXT:    br label [[EXIT]]
+; EVL:       exit:
+; EVL-NEXT:    [[ADD:%.*]] = phi float [ [[TMP29]], [[EXIT_LOOPEXIT]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
 ; EVL-NEXT:    ret float [[ADD]]
 ; EVL:       for.body:
-; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; EVL-NEXT:    [[S_021:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
 ; EVL-NEXT:    [[T_020:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0]], i64 [[IV]]
 ; EVL-NEXT:    [[TMP30:%.*]] = load float, ptr [[ARRAYIDX]], align 4
 ; EVL-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP30]], 0.000000e+00
 ; EVL-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP30]], float [[T_020]]
-; EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i64 [[IV]]
 ; EVL-NEXT:    [[TMP31:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
 ; EVL-NEXT:    [[CMP6:%.*]] = fcmp ogt float [[TMP31]], 0.000000e+00
 ; EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], float [[TMP31]], float [[S_021]]
-; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
 ;
 ; NO-EVL-LABEL: @csa_in_series_float_select(
 ; NO-EVL-NEXT:  entry:
 ; NO-EVL-NEXT:    [[CMP19:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL-NEXT:    br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
 ; NO-EVL:       for.body.preheader:
 ; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
 ; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -1695,154 +1274,65 @@ define float @csa_in_series_float_select(i32 %N, ptr %data0, ptr %data1) {
 ; NO-EVL-NEXT:    [[TMP27:%.*]] = icmp sge i32 [[TMP26]], 0
 ; NO-EVL-NEXT:    [[TMP28:%.*]] = select i1 [[TMP27]], float [[CSA_EXTRACT]], float 1.000000e+00
 ; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; NO-EVL:       scalar.ph:
 ; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       for.cond.cleanup.loopexit:
+; NO-EVL:       exit.loopexit:
 ; NO-EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ]
 ; NO-EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
 ; NO-EVL-NEXT:    [[TMP29:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
-; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; NO-EVL:       for.cond.cleanup:
-; NO-EVL-NEXT:    [[ADD:%.*]] = phi float [ [[TMP29]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
+; NO-EVL-NEXT:    br label [[EXIT]]
+; NO-EVL:       exit:
+; NO-EVL-NEXT:    [[ADD:%.*]] = phi float [ [[TMP29]], [[EXIT_LOOPEXIT]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
 ; NO-EVL-NEXT:    ret float [[ADD]]
 ; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; NO-EVL-NEXT:    [[S_021:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
 ; NO-EVL-NEXT:    [[T_020:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0]], i64 [[IV]]
 ; NO-EVL-NEXT:    [[TMP30:%.*]] = load float, ptr [[ARRAYIDX]], align 4
 ; NO-EVL-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP30]], 0.000000e+00
 ; NO-EVL-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP30]], float [[T_020]]
-; NO-EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i64 [[IV]]
 ; NO-EVL-NEXT:    [[TMP31:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
 ; NO-EVL-NEXT:    [[CMP6:%.*]] = fcmp ogt float [[TMP31]], 0.000000e+00
 ; NO-EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], float [[TMP31]], float [[S_021]]
-; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-;
-; DATA-LABEL: @csa_in_series_float_select(
-; DATA-NEXT:  entry:
-; DATA-NEXT:    [[CMP19:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT:    br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA:       for.body.preheader:
-; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
-; DATA-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DATA:       vector.ph:
-; DATA-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; DATA-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; DATA-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    br label [[VECTOR_BODY:%.*]]
-; DATA:       vector.body:
-; DATA-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 1 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; DATA-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[TMP3]]
-; DATA-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 0
-; DATA-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x float>, ptr [[TMP5]], align 4
-; DATA-NEXT:    [[TMP6:%.*]] = fcmp ogt <vscale x 1 x float> [[WIDE_LOAD]], zeroinitializer
-; DATA-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP6]])
-; DATA-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP7]], <vscale x 1 x i1> [[TMP6]], <vscale x 1 x i1> [[CSA_MASK_PHI1]]
-; DATA-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP7]], <vscale x 1 x float> [[WIDE_LOAD]], <vscale x 1 x float> [[CSA_DATA_PHI2]]
-; DATA-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[TMP3]]
-; DATA-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0
-; DATA-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 1 x float>, ptr [[TMP9]], align 4
-; DATA-NEXT:    [[TMP10:%.*]] = fcmp ogt <vscale x 1 x float> [[WIDE_LOAD3]], zeroinitializer
-; DATA-NEXT:    [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP10]])
-; DATA-NEXT:    [[CSA_MASK_SEL4]] = select i1 [[TMP11]], <vscale x 1 x i1> [[TMP10]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
-; DATA-NEXT:    [[CSA_DATA_SEL5]] = select i1 [[TMP11]], <vscale x 1 x float> [[WIDE_LOAD3]], <vscale x 1 x float> [[CSA_DATA_PHI]]
-; DATA-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
-; DATA-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DATA-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; DATA:       middle.block:
-; DATA-NEXT:    [[CSA_STEP6:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; DATA-NEXT:    [[TMP13:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL4]], <vscale x 1 x i32> [[CSA_STEP6]], <vscale x 1 x i32> zeroinitializer
-; DATA-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP13]])
-; DATA-NEXT:    [[TMP15:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL4]], i64 0
-; DATA-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[TMP14]], 0
-; DATA-NEXT:    [[TMP17:%.*]] = and i1 [[TMP15]], [[TMP16]]
-; DATA-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i32 0, i32 -1
-; DATA-NEXT:    [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 1 x float> [[CSA_DATA_SEL5]], i32 [[TMP18]]
-; DATA-NEXT:    [[TMP19:%.*]] = icmp sge i32 [[TMP18]], 0
-; DATA-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], float [[CSA_EXTRACT7]], float 1.000000e+00
-; DATA-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; DATA-NEXT:    [[TMP21:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
-; DATA-NEXT:    [[TMP22:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP21]])
-; DATA-NEXT:    [[TMP23:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
-; DATA-NEXT:    [[TMP24:%.*]] = icmp eq i32 [[TMP22]], 0
-; DATA-NEXT:    [[TMP25:%.*]] = and i1 [[TMP23]], [[TMP24]]
-; DATA-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i32 0, i32 -1
-; DATA-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x float> [[CSA_DATA_SEL]], i32 [[TMP26]]
-; DATA-NEXT:    [[TMP27:%.*]] = icmp sge i32 [[TMP26]], 0
-; DATA-NEXT:    [[TMP28:%.*]] = select i1 [[TMP27]], float [[CSA_EXTRACT]], float 1.000000e+00
-; DATA-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; DATA-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; DATA:       scalar.ph:
-; DATA-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; DATA-NEXT:    br label [[FOR_BODY:%.*]]
-; DATA:       for.cond.cleanup.loopexit:
-; DATA-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT:    [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT:    [[TMP29:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
-; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
-; DATA:       for.cond.cleanup:
-; DATA-NEXT:    [[ADD:%.*]] = phi float [ [[TMP29]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
-; DATA-NEXT:    ret float [[ADD]]
-; DATA:       for.body:
-; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[S_021:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[T_020:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP30:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; DATA-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP30]], 0.000000e+00
-; DATA-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP30]], float [[T_020]]
-; DATA-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP31:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
-; DATA-NEXT:    [[CMP6:%.*]] = fcmp ogt float [[TMP31]], 0.000000e+00
-; DATA-NEXT:    [[S_1]] = select i1 [[CMP6]], float [[TMP31]], float [[S_021]]
-; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; NO-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
 ;
 entry:
   %cmp19 = icmp sgt i32 %N, 0
-  br i1 %cmp19, label %for.body.preheader, label %for.cond.cleanup
+  br i1 %cmp19, label %for.body.preheader, label %exit
 
 for.body.preheader:                               ; preds = %entry
   %wide.trip.count = zext i32 %N to i64
   br label %for.body
 
-for.cond.cleanup.loopexit:                        ; preds = %for.body
+exit.loopexit:                        ; preds = %for.body
   %0 = fadd float %t.1, %s.1
-  br label %for.cond.cleanup
+  br label %exit
 
-for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
-  %add = phi float [ %0, %for.cond.cleanup.loopexit ], [ 2.000000e+00, %entry ]
+exit:                                 ; preds = %exit.loopexit, %entry
+  %add = phi float [ %0, %exit.loopexit ], [ 2.000000e+00, %entry ]
   ret float %add
 
 for.body:                                         ; preds = %for.body.preheader, %for.body
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
   %s.021 = phi float [ 1.000000e+00, %for.body.preheader ], [ %s.1, %for.body ]
   %t.020 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.body ]
-  %arrayidx = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
+  %arrayidx = getelementptr inbounds float, ptr %data0, i64 %iv
   %1 = load float, ptr %arrayidx, align 4
   %cmp1 = fcmp ogt float %1, 0.000000e+00
   %t.1 = select i1 %cmp1, float %1, float %t.020
-  %arrayidx5 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
+  %arrayidx5 = getelementptr inbounds float, ptr %data1, i64 %iv
   %2 = load float, ptr %arrayidx5, align 4
   %cmp6 = fcmp ogt float %2, 0.000000e+00
   %s.1 = select i1 %cmp6, float %2, float %s.021
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %exit.loopexit, label %for.body
 }
 
 ; This function is generated from the following C/C++ program:
@@ -1864,56 +1354,53 @@ define i32 @csa_in_series_int(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %d
 ; NO-EVL-LABEL: @csa_in_series_int(
 ; NO-EVL-NOT: vector.body:
 ;
-; DATA-LABEL: @csa_in_series_int(
-; DATA-NOT: vector.body:
-;
 entry:
   %cmp15 = icmp sgt i32 %N, 0
-  br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+  br i1 %cmp15, label %for.body.preheader, label %exit
 
 for.body.preheader:                               ; preds = %entry
   %wide.trip.count = zext i32 %N to i64
   br label %for.body
 
-for.cond.cleanup.loopexit:                        ; preds = %for.inc
+exit.loopexit:                        ; preds = %for.inc
   %0 = or i32 %s.1, %t.1
-  br label %for.cond.cleanup
+  br label %exit
 
-for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
-  %or = phi i32 [ %0, %for.cond.cleanup.loopexit ], [ -1, %entry ]
+exit:                                 ; preds = %exit.loopexit, %entry
+  %or = phi i32 [ %0, %exit.loopexit ], [ -1, %entry ]
   ret i32 %or
 
 for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
   %s.017 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.inc ]
   %t.016 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
-  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %iv
   %1 = load i8, ptr %arrayidx, align 1
   %tobool.not = icmp eq i8 %1, 0
   br i1 %tobool.not, label %if.end, label %if.then
 
 if.then:                                          ; preds = %for.body
-  %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
+  %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %iv
   %2 = load i32, ptr %arrayidx2, align 4
   br label %if.end
 
 if.end:                                           ; preds = %if.then, %for.body
   %t.1 = phi i32 [ %2, %if.then ], [ %t.016, %for.body ]
-  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %iv
   %3 = load i8, ptr %arrayidx4, align 1
   %tobool5.not = icmp eq i8 %3, 0
   br i1 %tobool5.not, label %for.inc, label %if.then6
 
 if.then6:                                         ; preds = %if.end
-  %arrayidx8 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
+  %arrayidx8 = getelementptr inbounds i32, ptr %data1, i64 %iv
   %4 = load i32, ptr %arrayidx8, align 4
   br label %for.inc
 
 for.inc:                                          ; preds = %if.end, %if.then6
   %s.1 = phi i32 [ %4, %if.then6 ], [ %s.017, %if.end ]
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %exit.loopexit, label %for.body
 }
 
 ; This function is generated from the following C/C++ program:
@@ -1936,56 +1423,53 @@ define float @csa_in_series_float(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, pt
 ; NO-EVL-LABEL: @csa_in_series_float(
 ; NO-EVL-NOT: vector.body:
 ;
-; DATA-LABEL: @csa_in_series_float(
-; DATA-NOT: vector.body:
-;
 entry:
   %cmp15 = icmp sgt i32 %N, 0
-  br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+  br i1 %cmp15, label %for.body.preheader, label %exit
 
 for.body.preheader:                               ; preds = %entry
   %wide.trip.count = zext i32 %N to i64
   br label %for.body
 
-for.cond.cleanup.loopexit:                        ; preds = %for.inc
+exit.loopexit:                        ; preds = %for.inc
   %0 = fadd float %t.1, %s.1
-  br label %for.cond.cleanup
+  br label %exit
 
-for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
-  %add = phi float [ %0, %for.cond.cleanup.loopexit ], [ 2.000000e+00, %entry ]
+exit:                                 ; preds = %exit.loopexit, %entry
+  %add = phi float [ %0, %exit.loopexit ], [ 2.000000e+00, %entry ]
   ret float %add
 
 for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
   %s.017 = phi float [ 1.000000e+00, %for.body.preheader ], [ %s.1, %for.inc ]
   %t.016 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
-  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %iv
   %1 = load i8, ptr %arrayidx, align 1
   %tobool.not = icmp eq i8 %1, 0
   br i1 %tobool.not, label %if.end, label %if.then
 
 if.then:                                          ; preds = %for.body
-  %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
+  %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %iv
   %2 = load float, ptr %arrayidx2, align 4
   br label %if.end
 
 if.end:                                           ; preds = %if.then, %for.body
   %t.1 = phi float [ %2, %if.then ], [ %t.016, %for.body ]
-  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %iv
   %3 = load i8, ptr %arrayidx4, align 1
   %tobool5.not = icmp eq i8 %3, 0
   br i1 %tobool5.not, label %for.inc, label %if.then6
 
 if.then6:                                         ; preds = %if.end
-  %arrayidx8 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
+  %arrayidx8 = getelementptr inbounds float, ptr %data1, i64 %iv
   %4 = load float, ptr %arrayidx8, align 4
   br label %for.inc
 
 for.inc:                                          ; preds = %if.end, %if.then6
   %s.1 = phi float [ %4, %if.then6 ], [ %s.017, %if.end ]
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %exit.loopexit, label %for.body
 }
 
 ; This function is generated from the following C/C++ program:
@@ -2007,37 +1491,34 @@ define i32 @csa_in_series_same_scalar_int_select(i32 %N, ptr %data0, ptr %data1)
 ; NO-EVL-LABEL: @csa_in_series_same_scalar_int_select(
 ; NO-EVL-NOT: vector.body:
 ;
-; DATA-LABEL: @csa_in_series_same_scalar_int_select(
-; DATA-NOT: vector.body:
-;
 entry:
   %cmp21 = icmp sgt i32 %N, 0
-  br i1 %cmp21, label %for.body.preheader, label %for.cond.cleanup
+  br i1 %cmp21, label %for.body.preheader, label %exit
 
 for.body.preheader:                               ; preds = %entry
   %wide.trip.count = zext i32 %N to i64
   br label %for.body
 
-for.cond.cleanup:                                 ; preds = %for.body, %entry
+exit:                                 ; preds = %for.body, %entry
   %t.0.lcssa = phi i32 [ -1, %entry ], [ %t.2, %for.body ]
   ret i32 %t.0.lcssa
 
 for.body:                                         ; preds = %for.body.preheader, %for.body
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
   %t.022 = phi i32 [ -1, %for.body.preheader ], [ %t.2, %for.body ]
-  %arrayidx = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
+  %arrayidx = getelementptr inbounds i32, ptr %data0, i64 %iv
   %0 = load i32, ptr %arrayidx, align 4
   %1 = sext i32 %0 to i64
-  %cmp1 = icmp slt i64 %indvars.iv, %1
+  %cmp1 = icmp slt i64 %iv, %1
   %spec.select = select i1 %cmp1, i32 %0, i32 %t.022
-  %arrayidx5 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
+  %arrayidx5 = getelementptr inbounds i32, ptr %data1, i64 %iv
   %2 = load i32, ptr %arrayidx5, align 4
   %3 = sext i32 %2 to i64
-  %cmp6 = icmp slt i64 %indvars.iv, %3
+  %cmp6 = icmp slt i64 %iv, %3
   %t.2 = select i1 %cmp6, i32 %2, i32 %spec.select
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %exit, label %for.body
 }
 
 ; This function is generated from the following C/C++ program:
@@ -2059,35 +1540,32 @@ define float @csa_in_series_same_scalar_float_select(i32 %N, ptr %data0, ptr %da
 ; NO-EVL-LABEL: @csa_in_series_same_scalar_float_select(
 ; NO-EVL-NOT: vector.body:
 ;
-; DATA-LABEL: @csa_in_series_same_scalar_float_select(
-; NO-EVL-NOT: vector.body:
-;
 entry:
   %cmp19 = icmp sgt i32 %N, 0
-  br i1 %cmp19, label %for.body.preheader, label %for.cond.cleanup
+  br i1 %cmp19, label %for.body.preheader, label %exit
 
 for.body.preheader:                               ; preds = %entry
   %wide.trip.count = zext i32 %N to i64
   br label %for.body
 
-for.cond.cleanup:                                 ; preds = %for.body, %entry
+exit:                                 ; preds = %for.body, %entry
   %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.2, %for.body ]
   ret float %t.0.lcssa
 
 for.body:                                         ; preds = %for.body.preheader, %for.body
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
   %t.020 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.2, %for.body ]
-  %arrayidx = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
+  %arrayidx = getelementptr inbounds float, ptr %data0, i64 %iv
   %0 = load float, ptr %arrayidx, align 4
   %cmp1 = fcmp ogt float %0, 0.000000e+00
   %t.1 = select i1 %cmp1, float %0, float %t.020
-  %arrayidx5 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
+  %arrayidx5 = getelementptr inbounds float, ptr %data1, i64 %iv
   %1 = load float, ptr %arrayidx5, align 4
   %cmp6 = fcmp ogt float %1, 0.000000e+00
   %t.2 = select i1 %cmp6, float %1, float %t.1
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %exit, label %for.body
 }
 
 ; This function is generated from the following C/C++ program:
@@ -2109,51 +1587,48 @@ define i32 @csa_in_series_same_scalar_int(i32 %N, ptr %cond0, ptr %cond1, ptr %d
 ; NO-EVL-LABEL: @csa_in_series_same_scalar_int(
 ; NO-EVL-NOT: vector.body:
 ;
-; DATA-LABEL: @csa_in_series_same_scalar_int(
-; DATA-NOT: vector.body:
-;
 entry:
   %cmp15 = icmp sgt i32 %N, 0
-  br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+  br i1 %cmp15, label %for.body.preheader, label %exit
 
 for.body.preheader:                               ; preds = %entry
   %wide.trip.count = zext i32 %N to i64
   br label %for.body
 
-for.cond.cleanup:                                 ; preds = %for.inc, %entry
+exit:                                 ; preds = %for.inc, %entry
   %t.0.lcssa = phi i32 [ -1, %entry ], [ %t.2, %for.inc ]
   ret i32 %t.0.lcssa
 
 for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
   %t.016 = phi i32 [ -1, %for.body.preheader ], [ %t.2, %for.inc ]
-  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %iv
   %0 = load i8, ptr %arrayidx, align 1
   %tobool.not = icmp eq i8 %0, 0
   br i1 %tobool.not, label %if.end, label %if.then
 
 if.then:                                          ; preds = %for.body
-  %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
+  %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %iv
   %1 = load i32, ptr %arrayidx2, align 4
   br label %if.end
 
 if.end:                                           ; preds = %if.then, %for.body
   %t.1 = phi i32 [ %1, %if.then ], [ %t.016, %for.body ]
-  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %iv
   %2 = load i8, ptr %arrayidx4, align 1
   %tobool5.not = icmp eq i8 %2, 0
   br i1 %tobool5.not, label %for.inc, label %if.then6
 
 if.then6:                                         ; preds = %if.end
-  %arrayidx8 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
+  %arrayidx8 = getelementptr inbounds i32, ptr %data1, i64 %iv
   %3 = load i32, ptr %arrayidx8, align 4
   br label %for.inc
 
 for.inc:                                          ; preds = %if.end, %if.then6
   %t.2 = phi i32 [ %3, %if.then6 ], [ %t.1, %if.end ]
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %exit, label %for.body
 }
 
 ; This function is generated from the following C/C++ program:
@@ -2175,51 +1650,48 @@ define float @csa_in_series_same_scalar_float(i32 %N, ptr %cond0, ptr %cond1, pt
 ; NO-EVL-LABEL: @csa_in_series_same_scalar_float(
 ; NO-EVL-NOT: vector.body:
 ;
-; DATA-LABEL: @csa_in_series_same_scalar_float(
-; DATA-NOT: vector.body:
-;
 entry:
   %cmp15 = icmp sgt i32 %N, 0
-  br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+  br i1 %cmp15, label %for.body.preheader, label %exit
 
 for.body.preheader:                               ; preds = %entry
   %wide.trip.count = zext i32 %N to i64
   br label %for.body
 
-for.cond.cleanup:                                 ; preds = %for.inc, %entry
+exit:                                 ; preds = %for.inc, %entry
   %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.2, %for.inc ]
   ret float %t.0.lcssa
 
 for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
   %t.016 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.2, %for.inc ]
-  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %iv
   %0 = load i8, ptr %arrayidx, align 1
   %tobool.not = icmp eq i8 %0, 0
   br i1 %tobool.not, label %if.end, label %if.then
 
 if.then:                                          ; preds = %for.body
-  %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
+  %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %iv
   %1 = load float, ptr %arrayidx2, align 4
   br label %if.end
 
 if.end:                                           ; preds = %if.then, %for.body
   %t.1 = phi float [ %1, %if.then ], [ %t.016, %for.body ]
-  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %iv
   %2 = load i8, ptr %arrayidx4, align 1
   %tobool5.not = icmp eq i8 %2, 0
   br i1 %tobool5.not, label %for.inc, label %if.then6
 
 if.then6:                                         ; preds = %if.end
-  %arrayidx8 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
+  %arrayidx8 = getelementptr inbounds float, ptr %data1, i64 %iv
   %3 = load float, ptr %arrayidx8, align 4
   br label %for.inc
 
 for.inc:                                          ; preds = %if.end, %if.then6
   %t.2 = phi float [ %3, %if.then6 ], [ %t.1, %if.end ]
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %exit, label %for.body
 }
 
 ; This function is generated from the following C/C++ program:
@@ -2241,47 +1713,44 @@ define i32 @csa_same_cond_int(i32 %N, ptr %cond, ptr %data0, ptr %data1) {
 ; NO-EVL-LABEL: @csa_same_cond_int(
 ; NO-EVL-NOT: vector.body:
 ;
-; DATA-LABEL: @csa_same_cond_int(
-; DATA-NOT: vector.body:
-;
 entry:
   %cmp9 = icmp sgt i32 %N, 0
-  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+  br i1 %cmp9, label %for.body.preheader, label %exit
 
 for.body.preheader:                               ; preds = %entry
   %wide.trip.count = zext i32 %N to i64
   br label %for.body
 
-for.cond.cleanup.loopexit:                        ; preds = %for.inc
+exit.loopexit:                        ; preds = %for.inc
   %0 = or i32 %s.1, %t.1
-  br label %for.cond.cleanup
+  br label %exit
 
-for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
-  %or = phi i32 [ %0, %for.cond.cleanup.loopexit ], [ -1, %entry ]
+exit:                                 ; preds = %exit.loopexit, %entry
+  %or = phi i32 [ %0, %exit.loopexit ], [ -1, %entry ]
   ret i32 %or
 
 for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
   %s.011 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.inc ]
   %t.010 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
-  %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %indvars.iv
+  %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %iv
   %1 = load i8, ptr %arrayidx, align 1
   %tobool.not = icmp eq i8 %1, 0
   br i1 %tobool.not, label %for.inc, label %if.then
 
 if.then:                                          ; preds = %for.body
-  %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
+  %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %iv
   %2 = load i32, ptr %arrayidx2, align 4
-  %arrayidx4 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
+  %arrayidx4 = getelementptr inbounds i32, ptr %data1, i64 %iv
   %3 = load i32, ptr %arrayidx4, align 4
   br label %for.inc
 
 for.inc:                                          ; preds = %for.body, %if.then
   %t.1 = phi i32 [ %2, %if.then ], [ %t.010, %for.body ]
   %s.1 = phi i32 [ %3, %if.then ], [ %s.011, %for.body ]
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %exit.loopexit, label %for.body
 }
 
 ; This function is generated from the following C/C++ program:
@@ -2303,47 +1772,44 @@ define float @csa_same_cond_float(i32 %N, ptr %cond, ptr %data0, ptr %data1) {
 ; NO-EVL-LABEL: @csa_same_cond_float(
 ; NO-EVL-NOT: vector.body:
 ;
-; DATA-LABEL: @csa_same_cond_float(
-; DATA-NOT: vector.body:
-;
 entry:
   %cmp9 = icmp sgt i32 %N, 0
-  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+  br i1 %cmp9, label %for.body.preheader, label %exit
 
 for.body.preheader:                               ; preds = %entry
   %wide.trip.count = zext i32 %N to i64
   br label %for.body
 
-for.cond.cleanup.loopexit:                        ; preds = %for.inc
+exit.loopexit:                        ; preds = %for.inc
   %0 = fadd float %t.1, %s.1
-  br label %for.cond.cleanup
+  br label %exit
 
-for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
-  %add = phi float [ %0, %for.cond.cleanup.loopexit ], [ 2.000000e+00, %entry ]
+exit:                                 ; preds = %exit.loopexit, %entry
+  %add = phi float [ %0, %exit.loopexit ], [ 2.000000e+00, %entry ]
   ret float %add
 
 for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
   %s.011 = phi float [ 1.000000e+00, %for.body.preheader ], [ %s.1, %for.inc ]
   %t.010 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
-  %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %indvars.iv
+  %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %iv
   %1 = load i8, ptr %arrayidx, align 1
   %tobool.not = icmp eq i8 %1, 0
   br i1 %tobool.not, label %for.inc, label %if.then
 
 if.then:                                          ; preds = %for.body
-  %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
+  %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %iv
   %2 = load float, ptr %arrayidx2, align 4
-  %arrayidx4 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
+  %arrayidx4 = getelementptr inbounds float, ptr %data1, i64 %iv
   %3 = load float, ptr %arrayidx4, align 4
   br label %for.inc
 
 for.inc:                                          ; preds = %for.body, %if.then
   %t.1 = phi float [ %2, %if.then ], [ %t.010, %for.body ]
   %s.1 = phi float [ %3, %if.then ], [ %s.011, %for.body ]
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %exit.loopexit, label %for.body
 }
 
 ; This function is generated from the following C/C++ program:
@@ -2365,46 +1831,43 @@ define i32 @csa_else_if_same_scalar_int(i32 %N, ptr %cond0, ptr %cond1, ptr %dat
 ; NO-EVL-LABEL: @csa_else_if_same_scalar_int(
 ; NO-EVL-NOT: vector.body:
 ;
-; DATA-LABEL: @csa_else_if_same_scalar_int(
-; DATA-NOT: vector.body:
-;
 entry:
   %cmp15 = icmp sgt i32 %N, 0
-  br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+  br i1 %cmp15, label %for.body.preheader, label %exit
 
 for.body.preheader:                               ; preds = %entry
   %wide.trip.count = zext i32 %N to i64
   br label %for.body
 
-for.cond.cleanup:                                 ; preds = %for.inc, %entry
+exit:                                 ; preds = %for.inc, %entry
   %t.0.lcssa = phi i32 [ -1, %entry ], [ %t.1, %for.inc ]
   ret i32 %t.0.lcssa
 
 for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
   %t.016 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
-  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %iv
   %0 = load i8, ptr %arrayidx, align 1
   %tobool.not = icmp eq i8 %0, 0
   br i1 %tobool.not, label %if.else, label %for.inc.sink.split
 
 if.else:                                          ; preds = %for.body
-  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %iv
   %1 = load i8, ptr %arrayidx4, align 1
   %tobool5.not = icmp eq i8 %1, 0
   br i1 %tobool5.not, label %for.inc, label %for.inc.sink.split
 
 for.inc.sink.split:                               ; preds = %if.else, %for.body
   %data0.sink = phi ptr [ %data0, %for.body ], [ %data1, %if.else ]
-  %arrayidx2 = getelementptr inbounds i32, ptr %data0.sink, i64 %indvars.iv
+  %arrayidx2 = getelementptr inbounds i32, ptr %data0.sink, i64 %iv
   %2 = load i32, ptr %arrayidx2, align 4
   br label %for.inc
 
 for.inc:                                          ; preds = %for.inc.sink.split, %if.else
   %t.1 = phi i32 [ %t.016, %if.else ], [ %2, %for.inc.sink.split ]
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %exit, label %for.body
 }
 
 ; This function is generated from the following C/C++ program:
@@ -2426,46 +1889,43 @@ define float @csa_else_if_same_scalar_float(i32 %N, ptr %cond0, ptr %cond1, ptr
 ; NO-EVL-LABEL: @csa_else_if_same_scalar_float(
 ; NO-EVL-NOT: vector.body:
 ;
-; DATA-LABEL: @csa_else_if_same_scalar_float(
-; DATA-NOT: vector.body:
-;
 entry:
   %cmp15 = icmp sgt i32 %N, 0
-  br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+  br i1 %cmp15, label %for.body.preheader, label %exit
 
 for.body.preheader:                               ; preds = %entry
   %wide.trip.count = zext i32 %N to i64
   br label %for.body
 
-for.cond.cleanup:                                 ; preds = %for.inc, %entry
+exit:                                 ; preds = %for.inc, %entry
   %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.1, %for.inc ]
   ret float %t.0.lcssa
 
 for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
   %t.016 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
-  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %iv
   %0 = load i8, ptr %arrayidx, align 1
   %tobool.not = icmp eq i8 %0, 0
   br i1 %tobool.not, label %if.else, label %for.inc.sink.split
 
 if.else:                                          ; preds = %for.body
-  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %iv
   %1 = load i8, ptr %arrayidx4, align 1
   %tobool5.not = icmp eq i8 %1, 0
   br i1 %tobool5.not, label %for.inc, label %for.inc.sink.split
 
 for.inc.sink.split:                               ; preds = %if.else, %for.body
   %data0.sink = phi ptr [ %data0, %for.body ], [ %data1, %if.else ]
-  %arrayidx2 = getelementptr inbounds float, ptr %data0.sink, i64 %indvars.iv
+  %arrayidx2 = getelementptr inbounds float, ptr %data0.sink, i64 %iv
   %2 = load float, ptr %arrayidx2, align 4
   br label %for.inc
 
 for.inc:                                          ; preds = %for.inc.sink.split, %if.else
   %t.1 = phi float [ %t.016, %if.else ], [ %2, %for.inc.sink.split ]
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %exit, label %for.body
 }
 
 ; This function is generated from the following C/C++ program:
@@ -2487,56 +1947,53 @@ define i32 @csa_else_if_int(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %dat
 ; NO-EVL-LABEL: @csa_else_if_int(
 ; NO-EVL-NOT: vector.body:
 ;
-; DATA-LABEL: @csa_else_if_int(
-; DATA-NOT: vector.body:
-;
 entry:
   %cmp15 = icmp sgt i32 %N, 0
-  br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+  br i1 %cmp15, label %for.body.preheader, label %exit
 
 for.body.preheader:                               ; preds = %entry
   %wide.trip.count = zext i32 %N to i64
   br label %for.body
 
-for.cond.cleanup.loopexit:                        ; preds = %for.inc
+exit.loopexit:                        ; preds = %for.inc
   %0 = or i32 %s.1, %t.1
-  br label %for.cond.cleanup
+  br label %exit
 
-for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
-  %or = phi i32 [ %0, %for.cond.cleanup.loopexit ], [ -1, %entry ]
+exit:                                 ; preds = %exit.loopexit, %entry
+  %or = phi i32 [ %0, %exit.loopexit ], [ -1, %entry ]
   ret i32 %or
 
 for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
   %s.017 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.inc ]
   %t.016 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
-  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %iv
   %1 = load i8, ptr %arrayidx, align 1
   %tobool.not = icmp eq i8 %1, 0
   br i1 %tobool.not, label %if.else, label %if.then
 
 if.then:                                          ; preds = %for.body
-  %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
+  %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %iv
   %2 = load i32, ptr %arrayidx2, align 4
   br label %for.inc
 
 if.else:                                          ; preds = %for.body
-  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %iv
   %3 = load i8, ptr %arrayidx4, align 1
   %tobool5.not = icmp eq i8 %3, 0
   br i1 %tobool5.not, label %for.inc, label %if.then6
 
 if.then6:                                         ; preds = %if.else
-  %arrayidx8 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
+  %arrayidx8 = getelementptr inbounds i32, ptr %data1, i64 %iv
   %4 = load i32, ptr %arrayidx8, align 4
   br label %for.inc
 
 for.inc:                                          ; preds = %if.then, %if.then6, %if.else
   %t.1 = phi i32 [ %2, %if.then ], [ %t.016, %if.then6 ], [ %t.016, %if.else ]
   %s.1 = phi i32 [ %s.017, %if.then ], [ %4, %if.then6 ], [ %s.017, %if.else ]
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %exit.loopexit, label %for.body
 }
 
 ; This function is generated from the following C/C++ program:
@@ -2559,56 +2016,53 @@ define float @csa_else_if_float(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr
 ; NO-EVL-LABEL: @csa_else_if_float(
 ; EVL-NOT: vector.body:
 ;
-; DATA-LABEL: @csa_else_if_float(
-; DATA-NOT: vector.body:
-;
 entry:
   %cmp15 = icmp sgt i32 %N, 0
-  br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+  br i1 %cmp15, label %for.body.preheader, label %exit
 
 for.body.preheader:                               ; preds = %entry
   %wide.trip.count = zext i32 %N to i64
   br label %for.body
 
-for.cond.cleanup.loopexit:                        ; preds = %for.inc
+exit.loopexit:                        ; preds = %for.inc
   %0 = fadd float %t.1, %s.1
-  br label %for.cond.cleanup
+  br label %exit
 
-for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
-  %add = phi float [ %0, %for.cond.cleanup.loopexit ], [ 2.000000e+00, %entry ]
+exit:                                 ; preds = %exit.loopexit, %entry
+  %add = phi float [ %0, %exit.loopexit ], [ 2.000000e+00, %entry ]
   ret float %add
 
 for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
   %s.017 = phi float [ 1.000000e+00, %for.body.preheader ], [ %s.1, %for.inc ]
   %t.016 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
-  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %iv
   %1 = load i8, ptr %arrayidx, align 1
   %tobool.not = icmp eq i8 %1, 0
   br i1 %tobool.not, label %if.else, label %if.then
 
 if.then:                                          ; preds = %for.body
-  %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
+  %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %iv
   %2 = load float, ptr %arrayidx2, align 4
   br label %for.inc
 
 if.else:                                          ; preds = %for.body
-  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %iv
   %3 = load i8, ptr %arrayidx4, align 1
   %tobool5.not = icmp eq i8 %3, 0
   br i1 %tobool5.not, label %for.inc, label %if.then6
 
 if.then6:                                         ; preds = %if.else
-  %arrayidx8 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
+  %arrayidx8 = getelementptr inbounds float, ptr %data1, i64 %iv
   %4 = load float, ptr %arrayidx8, align 4
   br label %for.inc
 
 for.inc:                                          ; preds = %if.then, %if.then6, %if.else
   %t.1 = phi float [ %2, %if.then ], [ %t.016, %if.then6 ], [ %t.016, %if.else ]
   %s.1 = phi float [ %s.017, %if.then ], [ %4, %if.then6 ], [ %s.017, %if.else ]
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %exit.loopexit, label %for.body
 }
 
 ; This function is generated from the following C/C++ program:
@@ -2625,22 +2079,15 @@ define i64 @idx_scalar(ptr %a, ptr %b, i64 %ii, i64 %n) {
 ; NO-EVL-LABEL: @idx_scalar(
 ; NO-EVL-NOT: vector.body:
 ;
-; DATA-LABEL: @idx_scalar(
-; DATA-NOT: vector.body:
-;
 entry:
   %cmp8.not = icmp eq i64 %n, 0
-  br i1 %cmp8.not, label %for.cond.cleanup, label %for.body.preheader
+  br i1 %cmp8.not, label %exit, label %for.body.preheader
 
 for.body.preheader:                               ; preds = %entry
   br label %for.body
 
-for.cond.cleanup.loopexit:                        ; preds = %for.body
-  %cond.lcssa = phi i64 [ %cond, %for.body ]
-  br label %for.cond.cleanup
-
-for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
-  %idx.0.lcssa = phi i64 [ %ii, %entry ], [ %cond.lcssa, %for.cond.cleanup.loopexit ]
+exit:                                 ; preds = %for.body, %entry
+  %idx.0.lcssa = phi i64 [ %ii, %entry ], [ %cond, %for.body ]
   ret i64 %idx.0.lcssa
 
 for.body:                                         ; preds = %for.body.preheader, %for.body
@@ -2654,7 +2101,7 @@ for.body:                                         ; preds = %for.body.preheader,
   %cond = select i1 %cmp2, i64 %i.010, i64 %idx.09
   %inc = add nuw i64 %i.010, 1
   %exitcond.not = icmp eq i64 %inc, %n
-  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+  br i1 %exitcond.not, label %exit, label %for.body
 }
 
 ; This function is generated from the following C/C++ program:
@@ -2671,22 +2118,15 @@ define dso_local i64 @idx_scalar_dec(ptr %a, ptr %b, i64 %ii, i64 %n) {
 ; NO-EVL-LABEL: @idx_scalar_dec(
 ; NO-EVL-NOT: vector.body:
 ;
-; DATA-LABEL: @idx_scalar_dec(
-; DATA-NOT: vector.body:
-;
 entry:
   %cmp.not9 = icmp eq i64 %n, 0
-  br i1 %cmp.not9, label %for.cond.cleanup, label %for.body.preheader
+  br i1 %cmp.not9, label %exit, label %for.body.preheader
 
 for.body.preheader:                               ; preds = %entry
   br label %for.body
 
-for.cond.cleanup.loopexit:                        ; preds = %for.body
-  %cond.lcssa = phi i64 [ %cond, %for.body ]
-  br label %for.cond.cleanup
-
-for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
-  %idx.0.lcssa = phi i64 [ %ii, %entry ], [ %cond.lcssa, %for.cond.cleanup.loopexit ]
+exit:                                 ; preds = %for.body, %entry
+  %idx.0.lcssa = phi i64 [ %ii, %entry ], [ %cond, %for.body ]
   ret i64 %idx.0.lcssa
 
 for.body:                                         ; preds = %for.body.preheader, %for.body
@@ -2700,7 +2140,7 @@ for.body:                                         ; preds = %for.body.preheader,
   %cmp3 = icmp sgt i64 %0, %1
   %cond = select i1 %cmp3, i64 %i.011, i64 %idx.010
   %cmp.not = icmp eq i64 %sub, 0
-  br i1 %cmp.not, label %for.cond.cleanup.loopexit, label %for.body
+  br i1 %cmp.not, label %exit, label %for.body
 }
 
 ; This function is generated from the following C/C++ program:
@@ -2718,7 +2158,7 @@ define i32 @simple_csa_int_select_neg_cond(i32 %N, ptr %data) {
 ; EVL-LABEL: @simple_csa_int_select_neg_cond(
 ; EVL-NEXT:  entry:
 ; EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
 ; EVL:       for.body.preheader:
 ; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
 ; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -2769,32 +2209,32 @@ define i32 @simple_csa_int_select_neg_cond(i32 %N, ptr %data) {
 ; EVL-NEXT:    [[TMP22:%.*]] = icmp sge i32 [[TMP21]], 0
 ; EVL-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[CSA_EXTRACT]], i32 0
 ; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; EVL:       scalar.ph:
 ; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       for.cond.cleanup.loopexit:
+; EVL:       exit.loopexit:
 ; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; EVL:       for.cond.cleanup:
-; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; EVL-NEXT:    br label [[EXIT]]
+; EVL:       exit:
+; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT]] ]
 ; EVL-NEXT:    ret i32 [[T_0_LCSSA]]
 ; EVL:       for.body:
-; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; EVL-NEXT:    [[T_010:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
 ; EVL-NEXT:    [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; EVL-NEXT:    [[TMP25:%.*]] = zext i32 [[TMP24]] to i64
-; EVL-NEXT:    [[CMP1_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], [[TMP25]]
+; EVL-NEXT:    [[CMP1_NOT:%.*]] = icmp eq i64 [[IV]], [[TMP25]]
 ; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP24]]
-; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ;
 ; NO-EVL-LABEL: @simple_csa_int_select_neg_cond(
 ; NO-EVL-NEXT:  entry:
 ; NO-EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
 ; NO-EVL:       for.body.preheader:
 ; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
 ; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -2845,131 +2285,51 @@ define i32 @simple_csa_int_select_neg_cond(i32 %N, ptr %data) {
 ; NO-EVL-NEXT:    [[TMP22:%.*]] = icmp sge i32 [[TMP21]], 0
 ; NO-EVL-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[CSA_EXTRACT]], i32 0
 ; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; NO-EVL:       scalar.ph:
 ; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       for.cond.cleanup.loopexit:
+; NO-EVL:       exit.loopexit:
 ; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; NO-EVL:       for.cond.cleanup:
-; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; NO-EVL-NEXT:    br label [[EXIT]]
+; NO-EVL:       exit:
+; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT]] ]
 ; NO-EVL-NEXT:    ret i32 [[T_0_LCSSA]]
 ; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; NO-EVL-NEXT:    [[T_010:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
 ; NO-EVL-NEXT:    [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; NO-EVL-NEXT:    [[TMP25:%.*]] = zext i32 [[TMP24]] to i64
-; NO-EVL-NEXT:    [[CMP1_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], [[TMP25]]
+; NO-EVL-NEXT:    [[CMP1_NOT:%.*]] = icmp eq i64 [[IV]], [[TMP25]]
 ; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP24]]
-; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-;
-; DATA-LABEL: @simple_csa_int_select_neg_cond(
-; DATA-NEXT:  entry:
-; DATA-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA:       for.body.preheader:
-; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
-; DATA-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DATA:       vector.ph:
-; DATA-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; DATA-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; DATA-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
-; DATA-NEXT:    [[TMP4:%.*]] = add <vscale x 1 x i64> [[TMP3]], zeroinitializer
-; DATA-NEXT:    [[TMP5:%.*]] = mul <vscale x 1 x i64> [[TMP4]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
-; DATA-NEXT:    [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP5]]
-; DATA-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP7:%.*]] = mul i64 1, [[TMP6]]
-; DATA-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP7]], i64 0
-; DATA-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; DATA-NEXT:    br label [[VECTOR_BODY:%.*]]
-; DATA:       vector.body:
-; DATA-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 0
-; DATA-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP8]]
-; DATA-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
-; DATA-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP10]], align 4
-; DATA-NEXT:    [[TMP11:%.*]] = zext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
-; DATA-NEXT:    [[TMP12:%.*]] = icmp eq <vscale x 1 x i64> [[VEC_IND]], [[TMP11]]
-; DATA-NEXT:    [[TMP13:%.*]] = xor <vscale x 1 x i1> [[TMP12]], shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer)
-; DATA-NEXT:    [[TMP14:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP13]])
-; DATA-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP14]], <vscale x 1 x i1> [[TMP13]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
-; DATA-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP14]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
-; DATA-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
-; DATA-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; DATA-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DATA-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; DATA:       middle.block:
-; DATA-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; DATA-NEXT:    [[TMP16:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
-; DATA-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP16]])
-; DATA-NEXT:    [[TMP18:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
-; DATA-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[TMP17]], 0
-; DATA-NEXT:    [[TMP20:%.*]] = and i1 [[TMP18]], [[TMP19]]
-; DATA-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], i32 0, i32 -1
-; DATA-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP21]]
-; DATA-NEXT:    [[TMP22:%.*]] = icmp sge i32 [[TMP21]], 0
-; DATA-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[CSA_EXTRACT]], i32 0
-; DATA-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; DATA-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; DATA:       scalar.ph:
-; DATA-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; DATA-NEXT:    br label [[FOR_BODY:%.*]]
-; DATA:       for.cond.cleanup.loopexit:
-; DATA-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
-; DATA:       for.cond.cleanup:
-; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; DATA-NEXT:    ret i32 [[T_0_LCSSA]]
-; DATA:       for.body:
-; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[T_010:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; DATA-NEXT:    [[TMP25:%.*]] = zext i32 [[TMP24]] to i64
-; DATA-NEXT:    [[CMP1_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], [[TMP25]]
-; DATA-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP24]]
-; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; NO-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ;
 entry:
   %cmp9 = icmp sgt i32 %N, 0
-  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+  br i1 %cmp9, label %for.body.preheader, label %exit
 
 for.body.preheader:                               ; preds = %entry
   %wide.trip.count = zext i32 %N to i64
   br label %for.body
 
-for.cond.cleanup.loopexit:                        ; preds = %for.body
-  %spec.select.lcssa = phi i32 [ %spec.select, %for.body ]
-  br label %for.cond.cleanup
-
-for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
-  %t.0.lcssa = phi i32 [ 0, %entry ], [ %spec.select.lcssa, %for.cond.cleanup.loopexit ]
+exit:                                 ; preds = %for.body, %entry
+  %t.0.lcssa = phi i32 [ 0, %entry ], [ %spec.select, %for.body ]
   ret i32 %t.0.lcssa
 
 for.body:                                         ; preds = %for.body.preheader, %for.body
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
   %t.010 = phi i32 [ 0, %for.body.preheader ], [ %spec.select, %for.body ]
-  %arrayidx = getelementptr inbounds i32, ptr %data, i64 %indvars.iv
+  %arrayidx = getelementptr inbounds i32, ptr %data, i64 %iv
   %0 = load i32, ptr %arrayidx, align 4
   %1 = zext i32 %0 to i64
-  %cmp1.not = icmp eq i64 %indvars.iv, %1
+  %cmp1.not = icmp eq i64 %iv, %1
   %spec.select = select i1 %cmp1.not, i32 %t.010, i32 %0
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %exit, label %for.body
 }
 
 ; This function is generated from the following C/C++ program:
@@ -2985,107 +2345,77 @@ define ptr @simple_csa_ptr_select(i32 %N, ptr %data, i64 %a) {
 ; EVL-LABEL: @simple_csa_ptr_select(
 ; EVL-NEXT:  entry:
 ; EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
 ; EVL:       for.body.preheader:
 ; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
 ; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       for.cond.cleanup.loopexit:
+; EVL:       exit.loopexit:
 ; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; EVL:       for.cond.cleanup:
-; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT:    br label [[EXIT]]
+; EVL:       exit:
+; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT:%.*]] ]
 ; EVL-NEXT:    ret ptr [[T_0_LCSSA]]
 ; EVL:       for.body:
-; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; EVL-NEXT:    [[T_010:%.*]] = phi ptr [ null, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[IV]]
 ; EVL-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
 ; EVL-NEXT:    [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
 ; EVL-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
 ; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A:%.*]], [[TMP2]]
 ; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP0]], ptr [[T_010]]
-; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]]
 ;
 ; NO-EVL-LABEL: @simple_csa_ptr_select(
 ; NO-EVL-NEXT:  entry:
 ; NO-EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
 ; NO-EVL:       for.body.preheader:
 ; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
 ; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       for.cond.cleanup.loopexit:
+; NO-EVL:       exit.loopexit:
 ; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
-; NO-EVL:       for.cond.cleanup:
-; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT:    br label [[EXIT]]
+; NO-EVL:       exit:
+; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT:%.*]] ]
 ; NO-EVL-NEXT:    ret ptr [[T_0_LCSSA]]
 ; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; NO-EVL-NEXT:    [[T_010:%.*]] = phi ptr [ null, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[IV]]
 ; NO-EVL-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
 ; NO-EVL-NEXT:    [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
 ; NO-EVL-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
 ; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A:%.*]], [[TMP2]]
 ; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP0]], ptr [[T_010]]
-; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; DATA-LABEL: @simple_csa_ptr_select(
-; DATA-NEXT:  entry:
-; DATA-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA:       for.body.preheader:
-; DATA-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT:    br label [[FOR_BODY:%.*]]
-; DATA:       for.cond.cleanup.loopexit:
-; DATA-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT:    br label [[FOR_COND_CLEANUP]]
-; DATA:       for.cond.cleanup:
-; DATA-NEXT:    [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; DATA-NEXT:    ret ptr [[T_0_LCSSA]]
-; DATA:       for.body:
-; DATA-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[T_010:%.*]] = phi ptr [ null, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; DATA-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
-; DATA-NEXT:    [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
-; DATA-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
-; DATA-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A:%.*]], [[TMP2]]
-; DATA-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP0]], ptr [[T_010]]
-; DATA-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; NO-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]]
 ;
 entry:
   %cmp9 = icmp sgt i32 %N, 0
-  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+  br i1 %cmp9, label %for.body.preheader, label %exit
 
 for.body.preheader:
   %wide.trip.count = zext i32 %N to i64
   br label %for.body
 
-for.cond.cleanup.loopexit:
-  %spec.select.lcssa = phi ptr [ %spec.select, %for.body ]
-  br label %for.cond.cleanup
-
-for.cond.cleanup:
-  %t.0.lcssa = phi ptr [ null, %entry ], [ %spec.select.lcssa, %for.cond.cleanup.loopexit ]
+exit:
+  %t.0.lcssa = phi ptr [ null, %entry ], [ %spec.select, %for.body ]
   ret ptr %t.0.lcssa
 
 for.body:
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
   %t.010 = phi ptr [ null, %for.body.preheader ], [ %spec.select, %for.body ]
-  %arrayidx = getelementptr inbounds ptr, ptr %data, i64 %indvars.iv
+  %arrayidx = getelementptr inbounds ptr, ptr %data, i64 %iv
   %0 = load ptr, ptr %arrayidx, align 8
   %1 = load i32, ptr %0, align 4
   %2 = sext i32 %1 to i64
   %cmp1 = icmp slt i64 %a, %2
   %spec.select = select i1 %cmp1, ptr %0, ptr %t.010
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %exit, label %for.body
 }

>From 7537b6610e68da5119752d52f70cac63978371ff Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Tue, 17 Sep 2024 15:38:53 -0700
Subject: [PATCH 14/23] fixup! fix cost functions

---
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 36 ++++++++++---------
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 658803e0e510bd..fc258537b01a58 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2366,15 +2366,17 @@ InstructionCost VPCSADataUpdateRecipe::computeCost(ElementCount VF,
 
   // FIXME: These costs should be moved into VPInstruction::computeCost. We put
   // them here for now since they are related to updating the data and there is
-  // no VPInstruction::computeCost support at the moment. CSAInitMask AnyActive
-  C += TTI.getArithmeticInstrCost(Instruction::Select, VTy, CostKind);
-  // vp.reduce.or
+  // no VPInstruction::computeCost support at the moment.
+
+  // CSAAnyActive
   C += TTI.getArithmeticReductionCost(Instruction::Or, VTy, std::nullopt,
                                       CostKind);
   // VPVLSel
-  C += TTI.getArithmeticInstrCost(Instruction::Select, VTy, CostKind);
+  C += TTI.getCmpSelInstrCost(Instruction::Select, VTy, MaskTy,
+                              CmpInst::BAD_ICMP_PREDICATE, CostKind);
   // MaskUpdate
-  C += TTI.getArithmeticInstrCost(Instruction::Select, MaskTy, CostKind);
+  C += TTI.getCmpSelInstrCost(Instruction::Select, MaskTy, MaskTy,
+                              CmpInst::BAD_ICMP_PREDICATE, CostKind);
   return C;
 }
 
@@ -2464,31 +2466,31 @@ VPCSAExtractScalarRecipe::computeCost(ElementCount VF,
                                     CostKind);
   } else {
     // ActiveLaneIdxs
-    C += TTI.getArithmeticInstrCost(Instruction::Select,
-                                    MaskTy->getScalarType(), CostKind);
+    C += TTI.getCmpSelInstrCost(Instruction::Select, MaskTy, MaskTy,
+                                CmpInst::BAD_ICMP_PREDICATE, CostKind);
     // MaybeLastIdx
     C += TTI.getMinMaxReductionCost(Intrinsic::smax, Int32VTy, FastMathFlags(),
                                     CostKind);
     // IsLaneZeroActive
-    C += TTI.getArithmeticInstrCost(Instruction::ExtractElement, MaskTy,
-                                    CostKind);
+    C += TTI.getVectorInstrCost(Instruction::ExtractElement, MaskTy, CostKind);
     // MaybeLastIdxEQZero
-    C += TTI.getArithmeticInstrCost(Instruction::ICmp, MaskTy->getScalarType(),
-                                    CostKind);
+    C += TTI.getCmpSelInstrCost(Instruction::ICmp, Int32VTy, MaskTy,
+                                CmpInst::ICMP_EQ, CostKind);
     // And
     C += TTI.getArithmeticInstrCost(Instruction::And, MaskTy->getScalarType(),
                                     CostKind);
     // LastIdx
-    C += TTI.getArithmeticInstrCost(Instruction::Select, VTy->getScalarType(),
-                                    CostKind);
+    C += TTI.getCmpSelInstrCost(Instruction::Select, VTy, MaskTy,
+                                CmpInst::BAD_ICMP_PREDICATE, CostKind);
   }
   // ExtractFromVec
   C += TTI.getArithmeticInstrCost(Instruction::ExtractElement, VTy, CostKind);
-  // LastIdxGeZero
-  C += TTI.getArithmeticInstrCost(Instruction::ICmp, Int32VTy, CostKind);
+  // LastIdxGEZero
+  C += TTI.getCmpSelInstrCost(Instruction::ICmp, Int32VTy, MaskTy,
+                              CmpInst::ICMP_SGE, CostKind);
   // ChooseFromVecOrInit
-  C += TTI.getArithmeticInstrCost(Instruction::Select, VTy->getScalarType(),
-                                  CostKind);
+  C += TTI.getCmpSelInstrCost(Instruction::Select, VTy, MaskTy,
+                              CmpInst::BAD_ICMP_PREDICATE, CostKind);
   return C;
 }
 

>From d94506a234cce06c744e196d050b3b7a2abe3077 Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Tue, 17 Sep 2024 20:26:48 -0700
Subject: [PATCH 15/23] fixup! fix extract cost

---
 llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index fc258537b01a58..403c53ba9dd3b5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2484,7 +2484,7 @@ VPCSAExtractScalarRecipe::computeCost(ElementCount VF,
                                 CmpInst::BAD_ICMP_PREDICATE, CostKind);
   }
   // ExtractFromVec
-  C += TTI.getArithmeticInstrCost(Instruction::ExtractElement, VTy, CostKind);
+  C += TTI.getVectorInstrCost(Instruction::ExtractElement, VTy, CostKind);
   // LastIdxGEZero
   C += TTI.getCmpSelInstrCost(Instruction::ICmp, Int32VTy, MaskTy,
                               CmpInst::ICMP_SGE, CostKind);

>From 50f87ac3a6bb1f86b624d8c91ce2af9dbab83782 Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Wed, 18 Sep 2024 09:35:15 -0700
Subject: [PATCH 16/23] fixup! update tests after rebase

---
 .../RISCV/conditional-scalar-assignment.ll    | 228 +++++-----
 .../conditional-scalar-assignment.ll          | 430 +++++++++---------
 2 files changed, 322 insertions(+), 336 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/conditional-scalar-assignment.ll b/llvm/test/Transforms/LoopVectorize/RISCV/conditional-scalar-assignment.ll
index d8dd1d34e2bec3..7c625e0d98e569 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/conditional-scalar-assignment.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/conditional-scalar-assignment.ll
@@ -34,10 +34,8 @@ define i64 @idx_scalar(ptr %a, ptr %b, i64 %ii, i64 %n) {
 ; EVL-NEXT:    [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
 ; EVL-NEXT:    [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
 ; EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
-; EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
-; EVL-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP10]]
-; EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
+; EVL-NEXT:    [[TMP9:%.*]] = mul i64 1, [[TMP5]]
+; EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP9]], i64 0
 ; EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; EVL:       vector.body:
@@ -45,39 +43,39 @@ define i64 @idx_scalar(ptr %a, ptr %b, i64 %ii, i64 %n) {
 ; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
 ; EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP12]]
+; EVL-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 0
+; EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP10]]
+; EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP11]], i32 0
+; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP12]], align 8
+; EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP10]]
 ; EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i32 0
-; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP14]], align 8
-; EVL-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP12]]
-; EVL-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[TMP15]], i32 0
-; EVL-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i64>, ptr [[TMP16]], align 8
-; EVL-NEXT:    [[TMP17:%.*]] = icmp sgt <vscale x 2 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; EVL-NEXT:    [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP17]])
-; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP18]], <vscale x 2 x i1> [[TMP17]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
-; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP18]], <vscale x 2 x i64> [[VEC_IND]], <vscale x 2 x i64> [[CSA_DATA_PHI]]
+; EVL-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i64>, ptr [[TMP14]], align 8
+; EVL-NEXT:    [[TMP15:%.*]] = icmp sgt <vscale x 2 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; EVL-NEXT:    [[TMP16:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP15]])
+; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP16]], <vscale x 2 x i1> [[TMP15]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP16]], <vscale x 2 x i64> [[VEC_IND]], <vscale x 2 x i64> [[CSA_DATA_PHI]]
 ; EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; EVL-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; EVL-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; EVL:       middle.block:
 ; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; EVL-NEXT:    [[TMP20:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
-; EVL-NEXT:    [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP20]])
-; EVL-NEXT:    [[TMP22:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
-; EVL-NEXT:    [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
-; EVL-NEXT:    [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
-; EVL-NEXT:    [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
-; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x i64> [[CSA_DATA_SEL]], i32 [[TMP25]]
-; EVL-NEXT:    [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
-; EVL-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
+; EVL-NEXT:    [[TMP18:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
+; EVL-NEXT:    [[TMP19:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP18]])
+; EVL-NEXT:    [[TMP20:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP19]], 0
+; EVL-NEXT:    [[TMP22:%.*]] = and i1 [[TMP20]], [[TMP21]]
+; EVL-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], i32 0, i32 -1
+; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x i64> [[CSA_DATA_SEL]], i32 [[TMP23]]
+; EVL-NEXT:    [[TMP24:%.*]] = icmp sge i32 [[TMP23]], 0
+; EVL-NEXT:    [[TMP25:%.*]] = select i1 [[TMP24]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
 ; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; EVL:       scalar.ph:
 ; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; EVL-NEXT:    br label [[FOR_BODY:%.*]]
 ; EVL:       exit.loopexit:
-; EVL-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
 ; EVL-NEXT:    br label [[EXIT]]
 ; EVL:       exit:
 ; EVL-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[EXIT_LOOPEXIT]] ]
@@ -86,10 +84,10 @@ define i64 @idx_scalar(ptr %a, ptr %b, i64 %ii, i64 %n) {
 ; EVL-NEXT:    [[I_010:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; EVL-NEXT:    [[IDX_09:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
 ; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[I_010]]
-; EVL-NEXT:    [[TMP28:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; EVL-NEXT:    [[TMP26:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
 ; EVL-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[I_010]]
-; EVL-NEXT:    [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
-; EVL-NEXT:    [[CMP2:%.*]] = icmp sgt i64 [[TMP28]], [[TMP29]]
+; EVL-NEXT:    [[TMP27:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; EVL-NEXT:    [[CMP2:%.*]] = icmp sgt i64 [[TMP26]], [[TMP27]]
 ; EVL-NEXT:    [[COND]] = select i1 [[CMP2]], i64 [[I_010]], i64 [[IDX_09]]
 ; EVL-NEXT:    [[INC]] = add nuw i64 [[I_010]], 1
 ; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
@@ -115,10 +113,8 @@ define i64 @idx_scalar(ptr %a, ptr %b, i64 %ii, i64 %n) {
 ; NO-EVL-NEXT:    [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
 ; NO-EVL-NEXT:    [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
 ; NO-EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
-; NO-EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
-; NO-EVL-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP10]]
-; NO-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
+; NO-EVL-NEXT:    [[TMP9:%.*]] = mul i64 1, [[TMP5]]
+; NO-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP9]], i64 0
 ; NO-EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NO-EVL:       vector.body:
@@ -126,39 +122,39 @@ define i64 @idx_scalar(ptr %a, ptr %b, i64 %ii, i64 %n) {
 ; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
 ; NO-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; NO-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP12]]
+; NO-EVL-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP10]]
+; NO-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP11]], i32 0
+; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP12]], align 8
+; NO-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP10]]
 ; NO-EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i32 0
-; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP14]], align 8
-; NO-EVL-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP12]]
-; NO-EVL-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[TMP15]], i32 0
-; NO-EVL-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i64>, ptr [[TMP16]], align 8
-; NO-EVL-NEXT:    [[TMP17:%.*]] = icmp sgt <vscale x 2 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; NO-EVL-NEXT:    [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP17]])
-; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP18]], <vscale x 2 x i1> [[TMP17]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
-; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP18]], <vscale x 2 x i64> [[VEC_IND]], <vscale x 2 x i64> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i64>, ptr [[TMP14]], align 8
+; NO-EVL-NEXT:    [[TMP15:%.*]] = icmp sgt <vscale x 2 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; NO-EVL-NEXT:    [[TMP16:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP15]])
+; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP16]], <vscale x 2 x i1> [[TMP15]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP16]], <vscale x 2 x i64> [[VEC_IND]], <vscale x 2 x i64> [[CSA_DATA_PHI]]
 ; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; NO-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; NO-EVL-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-EVL-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; NO-EVL:       middle.block:
 ; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; NO-EVL-NEXT:    [[TMP20:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
-; NO-EVL-NEXT:    [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP20]])
-; NO-EVL-NEXT:    [[TMP22:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
-; NO-EVL-NEXT:    [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
-; NO-EVL-NEXT:    [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
-; NO-EVL-NEXT:    [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
-; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x i64> [[CSA_DATA_SEL]], i32 [[TMP25]]
-; NO-EVL-NEXT:    [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
-; NO-EVL-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
+; NO-EVL-NEXT:    [[TMP18:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
+; NO-EVL-NEXT:    [[TMP19:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP18]])
+; NO-EVL-NEXT:    [[TMP20:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP19]], 0
+; NO-EVL-NEXT:    [[TMP22:%.*]] = and i1 [[TMP20]], [[TMP21]]
+; NO-EVL-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], i32 0, i32 -1
+; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x i64> [[CSA_DATA_SEL]], i32 [[TMP23]]
+; NO-EVL-NEXT:    [[TMP24:%.*]] = icmp sge i32 [[TMP23]], 0
+; NO-EVL-NEXT:    [[TMP25:%.*]] = select i1 [[TMP24]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
 ; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; NO-EVL:       scalar.ph:
 ; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
 ; NO-EVL:       exit.loopexit:
-; NO-EVL-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
 ; NO-EVL-NEXT:    br label [[EXIT]]
 ; NO-EVL:       exit:
 ; NO-EVL-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[EXIT_LOOPEXIT]] ]
@@ -167,10 +163,10 @@ define i64 @idx_scalar(ptr %a, ptr %b, i64 %ii, i64 %n) {
 ; NO-EVL-NEXT:    [[I_010:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; NO-EVL-NEXT:    [[IDX_09:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
 ; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[I_010]]
-; NO-EVL-NEXT:    [[TMP28:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; NO-EVL-NEXT:    [[TMP26:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
 ; NO-EVL-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[I_010]]
-; NO-EVL-NEXT:    [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
-; NO-EVL-NEXT:    [[CMP2:%.*]] = icmp sgt i64 [[TMP28]], [[TMP29]]
+; NO-EVL-NEXT:    [[TMP27:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; NO-EVL-NEXT:    [[CMP2:%.*]] = icmp sgt i64 [[TMP26]], [[TMP27]]
 ; NO-EVL-NEXT:    [[COND]] = select i1 [[CMP2]], i64 [[I_010]], i64 [[IDX_09]]
 ; NO-EVL-NEXT:    [[INC]] = add nuw i64 [[I_010]], 1
 ; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
@@ -413,10 +409,8 @@ define ptr @simple_csa_ptr_select(i32 %N, ptr %data) {
 ; EVL-NEXT:    [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
 ; EVL-NEXT:    [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
 ; EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
-; EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
-; EVL-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP10]]
-; EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
+; EVL-NEXT:    [[TMP9:%.*]] = mul i64 1, [[TMP5]]
+; EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP9]], i64 0
 ; EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; EVL:       vector.body:
@@ -424,38 +418,38 @@ define ptr @simple_csa_ptr_select(i32 %N, ptr %data) {
 ; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
 ; EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x ptr> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[TMP12]]
-; EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds ptr, ptr [[TMP13]], i32 0
-; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x ptr>, ptr [[TMP14]], align 8
+; EVL-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 0
+; EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[TMP10]]
+; EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds ptr, ptr [[TMP11]], i32 0
+; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x ptr>, ptr [[TMP12]], align 8
 ; EVL-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> [[WIDE_LOAD]], i32 4, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i32> poison)
-; EVL-NEXT:    [[TMP15:%.*]] = sext <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i64>
-; EVL-NEXT:    [[TMP16:%.*]] = icmp slt <vscale x 2 x i64> [[VEC_IND]], [[TMP15]]
-; EVL-NEXT:    [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP16]])
-; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 2 x i1> [[TMP16]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
-; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 2 x ptr> [[WIDE_LOAD]], <vscale x 2 x ptr> [[CSA_DATA_PHI]]
+; EVL-NEXT:    [[TMP13:%.*]] = sext <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i64>
+; EVL-NEXT:    [[TMP14:%.*]] = icmp slt <vscale x 2 x i64> [[VEC_IND]], [[TMP13]]
+; EVL-NEXT:    [[TMP15:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP14]])
+; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP15]], <vscale x 2 x i1> [[TMP14]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP15]], <vscale x 2 x ptr> [[WIDE_LOAD]], <vscale x 2 x ptr> [[CSA_DATA_PHI]]
 ; EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; EVL-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; EVL-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; EVL:       middle.block:
 ; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; EVL-NEXT:    [[TMP19:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
-; EVL-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP19]])
-; EVL-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
-; EVL-NEXT:    [[TMP22:%.*]] = icmp eq i32 [[TMP20]], 0
-; EVL-NEXT:    [[TMP23:%.*]] = and i1 [[TMP21]], [[TMP22]]
-; EVL-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i32 0, i32 -1
-; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x ptr> [[CSA_DATA_SEL]], i32 [[TMP24]]
-; EVL-NEXT:    [[TMP25:%.*]] = icmp sge i32 [[TMP24]], 0
-; EVL-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], ptr [[CSA_EXTRACT]], ptr null
+; EVL-NEXT:    [[TMP17:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
+; EVL-NEXT:    [[TMP18:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP17]])
+; EVL-NEXT:    [[TMP19:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT:    [[TMP20:%.*]] = icmp eq i32 [[TMP18]], 0
+; EVL-NEXT:    [[TMP21:%.*]] = and i1 [[TMP19]], [[TMP20]]
+; EVL-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], i32 0, i32 -1
+; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x ptr> [[CSA_DATA_SEL]], i32 [[TMP22]]
+; EVL-NEXT:    [[TMP23:%.*]] = icmp sge i32 [[TMP22]], 0
+; EVL-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], ptr [[CSA_EXTRACT]], ptr null
 ; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; EVL:       scalar.ph:
 ; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; EVL-NEXT:    br label [[FOR_BODY:%.*]]
 ; EVL:       exit.loopexit:
-; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP24]], [[MIDDLE_BLOCK]] ]
 ; EVL-NEXT:    br label [[EXIT]]
 ; EVL:       exit:
 ; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT]] ]
@@ -464,11 +458,11 @@ define ptr @simple_csa_ptr_select(i32 %N, ptr %data) {
 ; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; EVL-NEXT:    [[T_010:%.*]] = phi ptr [ null, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
 ; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
-; EVL-NEXT:    [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4
-; EVL-NEXT:    [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
-; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP29]]
-; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP27]], ptr [[T_010]]
+; EVL-NEXT:    [[TMP25:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
+; EVL-NEXT:    [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4
+; EVL-NEXT:    [[TMP27:%.*]] = sext i32 [[TMP26]] to i64
+; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP27]]
+; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP25]], ptr [[T_010]]
 ; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
 ; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
@@ -494,10 +488,8 @@ define ptr @simple_csa_ptr_select(i32 %N, ptr %data) {
 ; NO-EVL-NEXT:    [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
 ; NO-EVL-NEXT:    [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
 ; NO-EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
-; NO-EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
-; NO-EVL-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP10]]
-; NO-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
+; NO-EVL-NEXT:    [[TMP9:%.*]] = mul i64 1, [[TMP5]]
+; NO-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP9]], i64 0
 ; NO-EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NO-EVL:       vector.body:
@@ -505,38 +497,38 @@ define ptr @simple_csa_ptr_select(i32 %N, ptr %data) {
 ; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
 ; NO-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x ptr> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; NO-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[TMP12]]
-; NO-EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds ptr, ptr [[TMP13]], i32 0
-; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x ptr>, ptr [[TMP14]], align 8
+; NO-EVL-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[TMP10]]
+; NO-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds ptr, ptr [[TMP11]], i32 0
+; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x ptr>, ptr [[TMP12]], align 8
 ; NO-EVL-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> [[WIDE_LOAD]], i32 4, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i32> poison)
-; NO-EVL-NEXT:    [[TMP15:%.*]] = sext <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i64>
-; NO-EVL-NEXT:    [[TMP16:%.*]] = icmp slt <vscale x 2 x i64> [[VEC_IND]], [[TMP15]]
-; NO-EVL-NEXT:    [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP16]])
-; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 2 x i1> [[TMP16]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
-; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 2 x ptr> [[WIDE_LOAD]], <vscale x 2 x ptr> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT:    [[TMP13:%.*]] = sext <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i64>
+; NO-EVL-NEXT:    [[TMP14:%.*]] = icmp slt <vscale x 2 x i64> [[VEC_IND]], [[TMP13]]
+; NO-EVL-NEXT:    [[TMP15:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP14]])
+; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP15]], <vscale x 2 x i1> [[TMP14]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP15]], <vscale x 2 x ptr> [[WIDE_LOAD]], <vscale x 2 x ptr> [[CSA_DATA_PHI]]
 ; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; NO-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; NO-EVL-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; NO-EVL-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; NO-EVL:       middle.block:
 ; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; NO-EVL-NEXT:    [[TMP19:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
-; NO-EVL-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP19]])
-; NO-EVL-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
-; NO-EVL-NEXT:    [[TMP22:%.*]] = icmp eq i32 [[TMP20]], 0
-; NO-EVL-NEXT:    [[TMP23:%.*]] = and i1 [[TMP21]], [[TMP22]]
-; NO-EVL-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i32 0, i32 -1
-; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x ptr> [[CSA_DATA_SEL]], i32 [[TMP24]]
-; NO-EVL-NEXT:    [[TMP25:%.*]] = icmp sge i32 [[TMP24]], 0
-; NO-EVL-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], ptr [[CSA_EXTRACT]], ptr null
+; NO-EVL-NEXT:    [[TMP17:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
+; NO-EVL-NEXT:    [[TMP18:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP17]])
+; NO-EVL-NEXT:    [[TMP19:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT:    [[TMP20:%.*]] = icmp eq i32 [[TMP18]], 0
+; NO-EVL-NEXT:    [[TMP21:%.*]] = and i1 [[TMP19]], [[TMP20]]
+; NO-EVL-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], i32 0, i32 -1
+; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x ptr> [[CSA_DATA_SEL]], i32 [[TMP22]]
+; NO-EVL-NEXT:    [[TMP23:%.*]] = icmp sge i32 [[TMP22]], 0
+; NO-EVL-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], ptr [[CSA_EXTRACT]], ptr null
 ; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; NO-EVL:       scalar.ph:
 ; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
 ; NO-EVL:       exit.loopexit:
-; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP24]], [[MIDDLE_BLOCK]] ]
 ; NO-EVL-NEXT:    br label [[EXIT]]
 ; NO-EVL:       exit:
 ; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT]] ]
@@ -545,11 +537,11 @@ define ptr @simple_csa_ptr_select(i32 %N, ptr %data) {
 ; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; NO-EVL-NEXT:    [[T_010:%.*]] = phi ptr [ null, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
 ; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
-; NO-EVL-NEXT:    [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4
-; NO-EVL-NEXT:    [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
-; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP29]]
-; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP27]], ptr [[T_010]]
+; NO-EVL-NEXT:    [[TMP25:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
+; NO-EVL-NEXT:    [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4
+; NO-EVL-NEXT:    [[TMP27:%.*]] = sext i32 [[TMP26]] to i64
+; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP27]]
+; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP25]], ptr [[T_010]]
 ; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
 ; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment.ll b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment.ll
index dc88d3d5d5528c..8f3259b2565e3d 100644
--- a/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment.ll
+++ b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment.ll
@@ -205,9 +205,8 @@ define i32 @simple_csa_int_select_induction_cmp(i32 %N, ptr %data) {
 ; EVL-NEXT:    [[TMP4:%.*]] = add <vscale x 1 x i64> [[TMP3]], zeroinitializer
 ; EVL-NEXT:    [[TMP5:%.*]] = mul <vscale x 1 x i64> [[TMP4]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
 ; EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP5]]
-; EVL-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP7:%.*]] = mul i64 1, [[TMP6]]
-; EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP7]], i64 0
+; EVL-NEXT:    [[TMP6:%.*]] = mul i64 1, [[TMP2]]
+; EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP6]], i64 0
 ; EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
 ; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; EVL:       vector.body:
@@ -215,37 +214,37 @@ define i32 @simple_csa_int_select_induction_cmp(i32 %N, ptr %data) {
 ; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
 ; EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 0
-; EVL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP8]]
-; EVL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
-; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP10]], align 4
-; EVL-NEXT:    [[TMP11:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
-; EVL-NEXT:    [[TMP12:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP11]]
-; EVL-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP12]])
-; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP13]], <vscale x 1 x i1> [[TMP12]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
-; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP13]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
+; EVL-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP7]]
+; EVL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP9]], align 4
+; EVL-NEXT:    [[TMP10:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
+; EVL-NEXT:    [[TMP11:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP10]]
+; EVL-NEXT:    [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP11]])
+; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP12]], <vscale x 1 x i1> [[TMP11]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP12]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
 ; EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
 ; EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; EVL-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; EVL-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; EVL:       middle.block:
 ; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; EVL-NEXT:    [[TMP15:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
-; EVL-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP15]])
-; EVL-NEXT:    [[TMP17:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
-; EVL-NEXT:    [[TMP18:%.*]] = icmp eq i32 [[TMP16]], 0
-; EVL-NEXT:    [[TMP19:%.*]] = and i1 [[TMP17]], [[TMP18]]
-; EVL-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 0, i32 -1
-; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP20]]
-; EVL-NEXT:    [[TMP21:%.*]] = icmp sge i32 [[TMP20]], 0
-; EVL-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[CSA_EXTRACT]], i32 -1
+; EVL-NEXT:    [[TMP14:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; EVL-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP14]])
+; EVL-NEXT:    [[TMP16:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[TMP15]], 0
+; EVL-NEXT:    [[TMP18:%.*]] = and i1 [[TMP16]], [[TMP17]]
+; EVL-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 0, i32 -1
+; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP19]]
+; EVL-NEXT:    [[TMP20:%.*]] = icmp sge i32 [[TMP19]], 0
+; EVL-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[CSA_EXTRACT]], i32 -1
 ; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; EVL:       scalar.ph:
 ; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; EVL-NEXT:    br label [[FOR_BODY:%.*]]
 ; EVL:       exit.loopexit:
-; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
 ; EVL-NEXT:    br label [[EXIT]]
 ; EVL:       exit:
 ; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT]] ]
@@ -254,10 +253,10 @@ define i32 @simple_csa_int_select_induction_cmp(i32 %N, ptr %data) {
 ; EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; EVL-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
 ; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
-; EVL-NEXT:    [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT:    [[TMP24:%.*]] = sext i32 [[TMP23]] to i64
-; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[IV]], [[TMP24]]
-; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP23]], i32 [[T_010]]
+; EVL-NEXT:    [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT:    [[TMP23:%.*]] = sext i32 [[TMP22]] to i64
+; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[IV]], [[TMP23]]
+; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP22]], i32 [[T_010]]
 ; EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
 ; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -280,9 +279,8 @@ define i32 @simple_csa_int_select_induction_cmp(i32 %N, ptr %data) {
 ; NO-EVL-NEXT:    [[TMP4:%.*]] = add <vscale x 1 x i64> [[TMP3]], zeroinitializer
 ; NO-EVL-NEXT:    [[TMP5:%.*]] = mul <vscale x 1 x i64> [[TMP4]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
 ; NO-EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP5]]
-; NO-EVL-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[TMP7:%.*]] = mul i64 1, [[TMP6]]
-; NO-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP7]], i64 0
+; NO-EVL-NEXT:    [[TMP6:%.*]] = mul i64 1, [[TMP2]]
+; NO-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP6]], i64 0
 ; NO-EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
 ; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NO-EVL:       vector.body:
@@ -290,37 +288,37 @@ define i32 @simple_csa_int_select_induction_cmp(i32 %N, ptr %data) {
 ; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
 ; NO-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 0
-; NO-EVL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP8]]
-; NO-EVL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
-; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP10]], align 4
-; NO-EVL-NEXT:    [[TMP11:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
-; NO-EVL-NEXT:    [[TMP12:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP11]]
-; NO-EVL-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP12]])
-; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP13]], <vscale x 1 x i1> [[TMP12]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
-; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP13]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP7]]
+; NO-EVL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP9]], align 4
+; NO-EVL-NEXT:    [[TMP10:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
+; NO-EVL-NEXT:    [[TMP11:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP10]]
+; NO-EVL-NEXT:    [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP11]])
+; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP12]], <vscale x 1 x i1> [[TMP11]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP12]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
 ; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
 ; NO-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; NO-EVL-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; NO-EVL-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; NO-EVL:       middle.block:
 ; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; NO-EVL-NEXT:    [[TMP15:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
-; NO-EVL-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP15]])
-; NO-EVL-NEXT:    [[TMP17:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
-; NO-EVL-NEXT:    [[TMP18:%.*]] = icmp eq i32 [[TMP16]], 0
-; NO-EVL-NEXT:    [[TMP19:%.*]] = and i1 [[TMP17]], [[TMP18]]
-; NO-EVL-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 0, i32 -1
-; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP20]]
-; NO-EVL-NEXT:    [[TMP21:%.*]] = icmp sge i32 [[TMP20]], 0
-; NO-EVL-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[CSA_EXTRACT]], i32 -1
+; NO-EVL-NEXT:    [[TMP14:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; NO-EVL-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP14]])
+; NO-EVL-NEXT:    [[TMP16:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[TMP15]], 0
+; NO-EVL-NEXT:    [[TMP18:%.*]] = and i1 [[TMP16]], [[TMP17]]
+; NO-EVL-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 0, i32 -1
+; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP19]]
+; NO-EVL-NEXT:    [[TMP20:%.*]] = icmp sge i32 [[TMP19]], 0
+; NO-EVL-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[CSA_EXTRACT]], i32 -1
 ; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; NO-EVL:       scalar.ph:
 ; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
 ; NO-EVL:       exit.loopexit:
-; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
 ; NO-EVL-NEXT:    br label [[EXIT]]
 ; NO-EVL:       exit:
 ; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT]] ]
@@ -329,10 +327,10 @@ define i32 @simple_csa_int_select_induction_cmp(i32 %N, ptr %data) {
 ; NO-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; NO-EVL-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
 ; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
-; NO-EVL-NEXT:    [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; NO-EVL-NEXT:    [[TMP24:%.*]] = sext i32 [[TMP23]] to i64
-; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[IV]], [[TMP24]]
-; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP23]], i32 [[T_010]]
+; NO-EVL-NEXT:    [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT:    [[TMP23:%.*]] = sext i32 [[TMP22]] to i64
+; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[IV]], [[TMP23]]
+; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP22]], i32 [[T_010]]
 ; NO-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
 ; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -887,9 +885,8 @@ define i32 @csa_in_series_int_select_induction_cmp(i32 %N, ptr %data0, ptr %data
 ; EVL-NEXT:    [[TMP4:%.*]] = add <vscale x 1 x i64> [[TMP3]], zeroinitializer
 ; EVL-NEXT:    [[TMP5:%.*]] = mul <vscale x 1 x i64> [[TMP4]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
 ; EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP5]]
-; EVL-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP7:%.*]] = mul i64 1, [[TMP6]]
-; EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP7]], i64 0
+; EVL-NEXT:    [[TMP6:%.*]] = mul i64 1, [[TMP2]]
+; EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP6]], i64 0
 ; EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
 ; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; EVL:       vector.body:
@@ -899,75 +896,75 @@ define i32 @csa_in_series_int_select_induction_cmp(i32 %N, ptr %data0, ptr %data
 ; EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
 ; EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 0
-; EVL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP8]]
-; EVL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
-; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP10]], align 4
-; EVL-NEXT:    [[TMP11:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
-; EVL-NEXT:    [[TMP12:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP11]]
-; EVL-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP12]])
-; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP13]], <vscale x 1 x i1> [[TMP12]], <vscale x 1 x i1> [[CSA_MASK_PHI1]]
-; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP13]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI2]]
-; EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP8]]
-; EVL-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
-; EVL-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 1 x i32>, ptr [[TMP15]], align 4
-; EVL-NEXT:    [[TMP16:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD3]] to <vscale x 1 x i64>
-; EVL-NEXT:    [[TMP17:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP16]]
-; EVL-NEXT:    [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP17]])
-; EVL-NEXT:    [[CSA_MASK_SEL4]] = select i1 [[TMP18]], <vscale x 1 x i1> [[TMP17]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
-; EVL-NEXT:    [[CSA_DATA_SEL5]] = select i1 [[TMP18]], <vscale x 1 x i32> [[WIDE_LOAD3]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
+; EVL-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP7]]
+; EVL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP9]], align 4
+; EVL-NEXT:    [[TMP10:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
+; EVL-NEXT:    [[TMP11:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP10]]
+; EVL-NEXT:    [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP11]])
+; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP12]], <vscale x 1 x i1> [[TMP11]], <vscale x 1 x i1> [[CSA_MASK_PHI1]]
+; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP12]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI2]]
+; EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP7]]
+; EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; EVL-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 1 x i32>, ptr [[TMP14]], align 4
+; EVL-NEXT:    [[TMP15:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD3]] to <vscale x 1 x i64>
+; EVL-NEXT:    [[TMP16:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP15]]
+; EVL-NEXT:    [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP16]])
+; EVL-NEXT:    [[CSA_MASK_SEL4]] = select i1 [[TMP17]], <vscale x 1 x i1> [[TMP16]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT:    [[CSA_DATA_SEL5]] = select i1 [[TMP17]], <vscale x 1 x i32> [[WIDE_LOAD3]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
 ; EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
 ; EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; EVL-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; EVL-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; EVL:       middle.block:
 ; EVL-NEXT:    [[CSA_STEP6:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; EVL-NEXT:    [[TMP20:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL4]], <vscale x 1 x i32> [[CSA_STEP6]], <vscale x 1 x i32> zeroinitializer
-; EVL-NEXT:    [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP20]])
-; EVL-NEXT:    [[TMP22:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL4]], i64 0
-; EVL-NEXT:    [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
-; EVL-NEXT:    [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
-; EVL-NEXT:    [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
-; EVL-NEXT:    [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL5]], i32 [[TMP25]]
-; EVL-NEXT:    [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
-; EVL-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[CSA_EXTRACT7]], i32 -1
+; EVL-NEXT:    [[TMP19:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL4]], <vscale x 1 x i32> [[CSA_STEP6]], <vscale x 1 x i32> zeroinitializer
+; EVL-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP19]])
+; EVL-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL4]], i64 0
+; EVL-NEXT:    [[TMP22:%.*]] = icmp eq i32 [[TMP20]], 0
+; EVL-NEXT:    [[TMP23:%.*]] = and i1 [[TMP21]], [[TMP22]]
+; EVL-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i32 0, i32 -1
+; EVL-NEXT:    [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL5]], i32 [[TMP24]]
+; EVL-NEXT:    [[TMP25:%.*]] = icmp sge i32 [[TMP24]], 0
+; EVL-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[CSA_EXTRACT7]], i32 -1
 ; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; EVL-NEXT:    [[TMP28:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
-; EVL-NEXT:    [[TMP29:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP28]])
-; EVL-NEXT:    [[TMP30:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
-; EVL-NEXT:    [[TMP31:%.*]] = icmp eq i32 [[TMP29]], 0
-; EVL-NEXT:    [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]]
-; EVL-NEXT:    [[TMP33:%.*]] = select i1 [[TMP32]], i32 0, i32 -1
-; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP33]]
-; EVL-NEXT:    [[TMP34:%.*]] = icmp sge i32 [[TMP33]], 0
-; EVL-NEXT:    [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[CSA_EXTRACT]], i32 -1
+; EVL-NEXT:    [[TMP27:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; EVL-NEXT:    [[TMP28:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP27]])
+; EVL-NEXT:    [[TMP29:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT:    [[TMP30:%.*]] = icmp eq i32 [[TMP28]], 0
+; EVL-NEXT:    [[TMP31:%.*]] = and i1 [[TMP29]], [[TMP30]]
+; EVL-NEXT:    [[TMP32:%.*]] = select i1 [[TMP31]], i32 0, i32 -1
+; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP32]]
+; EVL-NEXT:    [[TMP33:%.*]] = icmp sge i32 [[TMP32]], 0
+; EVL-NEXT:    [[TMP34:%.*]] = select i1 [[TMP33]], i32 [[CSA_EXTRACT]], i32 -1
 ; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; EVL:       scalar.ph:
 ; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; EVL-NEXT:    br label [[FOR_BODY:%.*]]
 ; EVL:       exit.loopexit:
-; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP35]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT:    [[TMP36:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
+; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP34]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT:    [[TMP35:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
 ; EVL-NEXT:    br label [[EXIT]]
 ; EVL:       exit:
-; EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP36]], [[EXIT_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
+; EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP35]], [[EXIT_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
 ; EVL-NEXT:    ret i32 [[OR]]
 ; EVL:       for.body:
 ; EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; EVL-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
 ; EVL-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
 ; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[IV]]
-; EVL-NEXT:    [[TMP37:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT:    [[TMP38:%.*]] = sext i32 [[TMP37]] to i64
-; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[IV]], [[TMP38]]
-; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP37]], i32 [[T_022]]
+; EVL-NEXT:    [[TMP36:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT:    [[TMP37:%.*]] = sext i32 [[TMP36]] to i64
+; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[IV]], [[TMP37]]
+; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP36]], i32 [[T_022]]
 ; EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[IV]]
-; EVL-NEXT:    [[TMP39:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; EVL-NEXT:    [[TMP40:%.*]] = sext i32 [[TMP39]] to i64
-; EVL-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[IV]], [[TMP40]]
-; EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP39]], i32 [[S_023]]
+; EVL-NEXT:    [[TMP38:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; EVL-NEXT:    [[TMP39:%.*]] = sext i32 [[TMP38]] to i64
+; EVL-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[IV]], [[TMP39]]
+; EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP38]], i32 [[S_023]]
 ; EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
 ; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
@@ -990,9 +987,8 @@ define i32 @csa_in_series_int_select_induction_cmp(i32 %N, ptr %data0, ptr %data
 ; NO-EVL-NEXT:    [[TMP4:%.*]] = add <vscale x 1 x i64> [[TMP3]], zeroinitializer
 ; NO-EVL-NEXT:    [[TMP5:%.*]] = mul <vscale x 1 x i64> [[TMP4]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
 ; NO-EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP5]]
-; NO-EVL-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[TMP7:%.*]] = mul i64 1, [[TMP6]]
-; NO-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP7]], i64 0
+; NO-EVL-NEXT:    [[TMP6:%.*]] = mul i64 1, [[TMP2]]
+; NO-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP6]], i64 0
 ; NO-EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
 ; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NO-EVL:       vector.body:
@@ -1002,75 +998,75 @@ define i32 @csa_in_series_int_select_induction_cmp(i32 %N, ptr %data0, ptr %data
 ; NO-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
 ; NO-EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 0
-; NO-EVL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP8]]
-; NO-EVL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
-; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP10]], align 4
-; NO-EVL-NEXT:    [[TMP11:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
-; NO-EVL-NEXT:    [[TMP12:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP11]]
-; NO-EVL-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP12]])
-; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP13]], <vscale x 1 x i1> [[TMP12]], <vscale x 1 x i1> [[CSA_MASK_PHI1]]
-; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP13]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI2]]
-; NO-EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP8]]
-; NO-EVL-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
-; NO-EVL-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 1 x i32>, ptr [[TMP15]], align 4
-; NO-EVL-NEXT:    [[TMP16:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD3]] to <vscale x 1 x i64>
-; NO-EVL-NEXT:    [[TMP17:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP16]]
-; NO-EVL-NEXT:    [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP17]])
-; NO-EVL-NEXT:    [[CSA_MASK_SEL4]] = select i1 [[TMP18]], <vscale x 1 x i1> [[TMP17]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
-; NO-EVL-NEXT:    [[CSA_DATA_SEL5]] = select i1 [[TMP18]], <vscale x 1 x i32> [[WIDE_LOAD3]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP7]]
+; NO-EVL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP9]], align 4
+; NO-EVL-NEXT:    [[TMP10:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
+; NO-EVL-NEXT:    [[TMP11:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP10]]
+; NO-EVL-NEXT:    [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP11]])
+; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP12]], <vscale x 1 x i1> [[TMP11]], <vscale x 1 x i1> [[CSA_MASK_PHI1]]
+; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP12]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI2]]
+; NO-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP7]]
+; NO-EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; NO-EVL-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 1 x i32>, ptr [[TMP14]], align 4
+; NO-EVL-NEXT:    [[TMP15:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD3]] to <vscale x 1 x i64>
+; NO-EVL-NEXT:    [[TMP16:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP15]]
+; NO-EVL-NEXT:    [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP16]])
+; NO-EVL-NEXT:    [[CSA_MASK_SEL4]] = select i1 [[TMP17]], <vscale x 1 x i1> [[TMP16]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT:    [[CSA_DATA_SEL5]] = select i1 [[TMP17]], <vscale x 1 x i32> [[WIDE_LOAD3]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
 ; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
 ; NO-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; NO-EVL-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; NO-EVL-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; NO-EVL:       middle.block:
 ; NO-EVL-NEXT:    [[CSA_STEP6:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; NO-EVL-NEXT:    [[TMP20:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL4]], <vscale x 1 x i32> [[CSA_STEP6]], <vscale x 1 x i32> zeroinitializer
-; NO-EVL-NEXT:    [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP20]])
-; NO-EVL-NEXT:    [[TMP22:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL4]], i64 0
-; NO-EVL-NEXT:    [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
-; NO-EVL-NEXT:    [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
-; NO-EVL-NEXT:    [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
-; NO-EVL-NEXT:    [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL5]], i32 [[TMP25]]
-; NO-EVL-NEXT:    [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
-; NO-EVL-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[CSA_EXTRACT7]], i32 -1
+; NO-EVL-NEXT:    [[TMP19:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL4]], <vscale x 1 x i32> [[CSA_STEP6]], <vscale x 1 x i32> zeroinitializer
+; NO-EVL-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP19]])
+; NO-EVL-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL4]], i64 0
+; NO-EVL-NEXT:    [[TMP22:%.*]] = icmp eq i32 [[TMP20]], 0
+; NO-EVL-NEXT:    [[TMP23:%.*]] = and i1 [[TMP21]], [[TMP22]]
+; NO-EVL-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i32 0, i32 -1
+; NO-EVL-NEXT:    [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL5]], i32 [[TMP24]]
+; NO-EVL-NEXT:    [[TMP25:%.*]] = icmp sge i32 [[TMP24]], 0
+; NO-EVL-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[CSA_EXTRACT7]], i32 -1
 ; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; NO-EVL-NEXT:    [[TMP28:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
-; NO-EVL-NEXT:    [[TMP29:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP28]])
-; NO-EVL-NEXT:    [[TMP30:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
-; NO-EVL-NEXT:    [[TMP31:%.*]] = icmp eq i32 [[TMP29]], 0
-; NO-EVL-NEXT:    [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]]
-; NO-EVL-NEXT:    [[TMP33:%.*]] = select i1 [[TMP32]], i32 0, i32 -1
-; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP33]]
-; NO-EVL-NEXT:    [[TMP34:%.*]] = icmp sge i32 [[TMP33]], 0
-; NO-EVL-NEXT:    [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[CSA_EXTRACT]], i32 -1
+; NO-EVL-NEXT:    [[TMP27:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; NO-EVL-NEXT:    [[TMP28:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP27]])
+; NO-EVL-NEXT:    [[TMP29:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT:    [[TMP30:%.*]] = icmp eq i32 [[TMP28]], 0
+; NO-EVL-NEXT:    [[TMP31:%.*]] = and i1 [[TMP29]], [[TMP30]]
+; NO-EVL-NEXT:    [[TMP32:%.*]] = select i1 [[TMP31]], i32 0, i32 -1
+; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP32]]
+; NO-EVL-NEXT:    [[TMP33:%.*]] = icmp sge i32 [[TMP32]], 0
+; NO-EVL-NEXT:    [[TMP34:%.*]] = select i1 [[TMP33]], i32 [[CSA_EXTRACT]], i32 -1
 ; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; NO-EVL:       scalar.ph:
 ; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
 ; NO-EVL:       exit.loopexit:
-; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP35]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT:    [[TMP36:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
+; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP34]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT:    [[TMP35:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
 ; NO-EVL-NEXT:    br label [[EXIT]]
 ; NO-EVL:       exit:
-; NO-EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP36]], [[EXIT_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
+; NO-EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP35]], [[EXIT_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
 ; NO-EVL-NEXT:    ret i32 [[OR]]
 ; NO-EVL:       for.body:
 ; NO-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; NO-EVL-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
 ; NO-EVL-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
 ; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[IV]]
-; NO-EVL-NEXT:    [[TMP37:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; NO-EVL-NEXT:    [[TMP38:%.*]] = sext i32 [[TMP37]] to i64
-; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[IV]], [[TMP38]]
-; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP37]], i32 [[T_022]]
+; NO-EVL-NEXT:    [[TMP36:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT:    [[TMP37:%.*]] = sext i32 [[TMP36]] to i64
+; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[IV]], [[TMP37]]
+; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP36]], i32 [[T_022]]
 ; NO-EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[IV]]
-; NO-EVL-NEXT:    [[TMP39:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; NO-EVL-NEXT:    [[TMP40:%.*]] = sext i32 [[TMP39]] to i64
-; NO-EVL-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[IV]], [[TMP40]]
-; NO-EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP39]], i32 [[S_023]]
+; NO-EVL-NEXT:    [[TMP38:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; NO-EVL-NEXT:    [[TMP39:%.*]] = sext i32 [[TMP38]] to i64
+; NO-EVL-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[IV]], [[TMP39]]
+; NO-EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP38]], i32 [[S_023]]
 ; NO-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
 ; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
@@ -2173,9 +2169,8 @@ define i32 @simple_csa_int_select_neg_cond(i32 %N, ptr %data) {
 ; EVL-NEXT:    [[TMP4:%.*]] = add <vscale x 1 x i64> [[TMP3]], zeroinitializer
 ; EVL-NEXT:    [[TMP5:%.*]] = mul <vscale x 1 x i64> [[TMP4]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
 ; EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP5]]
-; EVL-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP7:%.*]] = mul i64 1, [[TMP6]]
-; EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP7]], i64 0
+; EVL-NEXT:    [[TMP6:%.*]] = mul i64 1, [[TMP2]]
+; EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP6]], i64 0
 ; EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
 ; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; EVL:       vector.body:
@@ -2183,38 +2178,38 @@ define i32 @simple_csa_int_select_neg_cond(i32 %N, ptr %data) {
 ; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
 ; EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 0
-; EVL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP8]]
-; EVL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
-; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP10]], align 4
-; EVL-NEXT:    [[TMP11:%.*]] = zext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
-; EVL-NEXT:    [[TMP12:%.*]] = icmp eq <vscale x 1 x i64> [[VEC_IND]], [[TMP11]]
-; EVL-NEXT:    [[TMP13:%.*]] = xor <vscale x 1 x i1> [[TMP12]], shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer)
-; EVL-NEXT:    [[TMP14:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP13]])
-; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP14]], <vscale x 1 x i1> [[TMP13]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
-; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP14]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
+; EVL-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP7]]
+; EVL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP9]], align 4
+; EVL-NEXT:    [[TMP10:%.*]] = zext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
+; EVL-NEXT:    [[TMP11:%.*]] = icmp eq <vscale x 1 x i64> [[VEC_IND]], [[TMP10]]
+; EVL-NEXT:    [[TMP12:%.*]] = xor <vscale x 1 x i1> [[TMP11]], shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer)
+; EVL-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP12]])
+; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP13]], <vscale x 1 x i1> [[TMP12]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP13]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
 ; EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
 ; EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; EVL-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; EVL-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; EVL:       middle.block:
 ; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; EVL-NEXT:    [[TMP16:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
-; EVL-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP16]])
-; EVL-NEXT:    [[TMP18:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
-; EVL-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[TMP17]], 0
-; EVL-NEXT:    [[TMP20:%.*]] = and i1 [[TMP18]], [[TMP19]]
-; EVL-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], i32 0, i32 -1
-; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP21]]
-; EVL-NEXT:    [[TMP22:%.*]] = icmp sge i32 [[TMP21]], 0
-; EVL-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[CSA_EXTRACT]], i32 0
+; EVL-NEXT:    [[TMP15:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; EVL-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP15]])
+; EVL-NEXT:    [[TMP17:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT:    [[TMP18:%.*]] = icmp eq i32 [[TMP16]], 0
+; EVL-NEXT:    [[TMP19:%.*]] = and i1 [[TMP17]], [[TMP18]]
+; EVL-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 0, i32 -1
+; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP20]]
+; EVL-NEXT:    [[TMP21:%.*]] = icmp sge i32 [[TMP20]], 0
+; EVL-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[CSA_EXTRACT]], i32 0
 ; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; EVL:       scalar.ph:
 ; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; EVL-NEXT:    br label [[FOR_BODY:%.*]]
 ; EVL:       exit.loopexit:
-; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
 ; EVL-NEXT:    br label [[EXIT]]
 ; EVL:       exit:
 ; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT]] ]
@@ -2223,10 +2218,10 @@ define i32 @simple_csa_int_select_neg_cond(i32 %N, ptr %data) {
 ; EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; EVL-NEXT:    [[T_010:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
 ; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
-; EVL-NEXT:    [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT:    [[TMP25:%.*]] = zext i32 [[TMP24]] to i64
-; EVL-NEXT:    [[CMP1_NOT:%.*]] = icmp eq i64 [[IV]], [[TMP25]]
-; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP24]]
+; EVL-NEXT:    [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT:    [[TMP24:%.*]] = zext i32 [[TMP23]] to i64
+; EVL-NEXT:    [[CMP1_NOT:%.*]] = icmp eq i64 [[IV]], [[TMP24]]
+; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP23]]
 ; EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
 ; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
@@ -2249,9 +2244,8 @@ define i32 @simple_csa_int_select_neg_cond(i32 %N, ptr %data) {
 ; NO-EVL-NEXT:    [[TMP4:%.*]] = add <vscale x 1 x i64> [[TMP3]], zeroinitializer
 ; NO-EVL-NEXT:    [[TMP5:%.*]] = mul <vscale x 1 x i64> [[TMP4]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
 ; NO-EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP5]]
-; NO-EVL-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[TMP7:%.*]] = mul i64 1, [[TMP6]]
-; NO-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP7]], i64 0
+; NO-EVL-NEXT:    [[TMP6:%.*]] = mul i64 1, [[TMP2]]
+; NO-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP6]], i64 0
 ; NO-EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
 ; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NO-EVL:       vector.body:
@@ -2259,38 +2253,38 @@ define i32 @simple_csa_int_select_neg_cond(i32 %N, ptr %data) {
 ; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
 ; NO-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 0
-; NO-EVL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP8]]
-; NO-EVL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
-; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP10]], align 4
-; NO-EVL-NEXT:    [[TMP11:%.*]] = zext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
-; NO-EVL-NEXT:    [[TMP12:%.*]] = icmp eq <vscale x 1 x i64> [[VEC_IND]], [[TMP11]]
-; NO-EVL-NEXT:    [[TMP13:%.*]] = xor <vscale x 1 x i1> [[TMP12]], shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer)
-; NO-EVL-NEXT:    [[TMP14:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP13]])
-; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP14]], <vscale x 1 x i1> [[TMP13]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
-; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP14]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP7]]
+; NO-EVL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP9]], align 4
+; NO-EVL-NEXT:    [[TMP10:%.*]] = zext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
+; NO-EVL-NEXT:    [[TMP11:%.*]] = icmp eq <vscale x 1 x i64> [[VEC_IND]], [[TMP10]]
+; NO-EVL-NEXT:    [[TMP12:%.*]] = xor <vscale x 1 x i1> [[TMP11]], shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer)
+; NO-EVL-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP12]])
+; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP13]], <vscale x 1 x i1> [[TMP12]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP13]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
 ; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
 ; NO-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; NO-EVL-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; NO-EVL-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; NO-EVL:       middle.block:
 ; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; NO-EVL-NEXT:    [[TMP16:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
-; NO-EVL-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP16]])
-; NO-EVL-NEXT:    [[TMP18:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
-; NO-EVL-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[TMP17]], 0
-; NO-EVL-NEXT:    [[TMP20:%.*]] = and i1 [[TMP18]], [[TMP19]]
-; NO-EVL-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], i32 0, i32 -1
-; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP21]]
-; NO-EVL-NEXT:    [[TMP22:%.*]] = icmp sge i32 [[TMP21]], 0
-; NO-EVL-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[CSA_EXTRACT]], i32 0
+; NO-EVL-NEXT:    [[TMP15:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; NO-EVL-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP15]])
+; NO-EVL-NEXT:    [[TMP17:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT:    [[TMP18:%.*]] = icmp eq i32 [[TMP16]], 0
+; NO-EVL-NEXT:    [[TMP19:%.*]] = and i1 [[TMP17]], [[TMP18]]
+; NO-EVL-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 0, i32 -1
+; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP20]]
+; NO-EVL-NEXT:    [[TMP21:%.*]] = icmp sge i32 [[TMP20]], 0
+; NO-EVL-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[CSA_EXTRACT]], i32 0
 ; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; NO-EVL:       scalar.ph:
 ; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
 ; NO-EVL:       exit.loopexit:
-; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
 ; NO-EVL-NEXT:    br label [[EXIT]]
 ; NO-EVL:       exit:
 ; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT]] ]
@@ -2299,10 +2293,10 @@ define i32 @simple_csa_int_select_neg_cond(i32 %N, ptr %data) {
 ; NO-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; NO-EVL-NEXT:    [[T_010:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
 ; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
-; NO-EVL-NEXT:    [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; NO-EVL-NEXT:    [[TMP25:%.*]] = zext i32 [[TMP24]] to i64
-; NO-EVL-NEXT:    [[CMP1_NOT:%.*]] = icmp eq i64 [[IV]], [[TMP25]]
-; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP24]]
+; NO-EVL-NEXT:    [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT:    [[TMP24:%.*]] = zext i32 [[TMP23]] to i64
+; NO-EVL-NEXT:    [[CMP1_NOT:%.*]] = icmp eq i64 [[IV]], [[TMP24]]
+; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP23]]
 ; NO-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
 ; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]

>From b2f4feb05e9059440d0ef8ea3a488d367f202eeb Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Mon, 23 Sep 2024 17:47:54 -0700
Subject: [PATCH 17/23] fixup! use cmpsel

---
 llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 403c53ba9dd3b5..b4b50456821ace 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2362,7 +2362,8 @@ InstructionCost VPCSADataUpdateRecipe::computeCost(ElementCount VF,
   const TargetTransformInfo &TTI = Ctx.TTI;
 
   // Data Update
-  C += TTI.getArithmeticInstrCost(Instruction::Select, VTy, CostKind);
+  C += TTI.getCmpSelInstrCost(Instruction::Select, VTy, MaskTy,
+                              CmpInst::BAD_ICMP_PREDICATE, CostKind);
 
   // FIXME: These costs should be moved into VPInstruction::computeCost. We put
   // them here for now since they are related to updating the data and there is

>From b7662c851270896ed4cb3098d28e251dd6e5c444 Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Thu, 26 Sep 2024 11:21:20 -0700
Subject: [PATCH 18/23] fixup! respond to reviews

---
 .../Vectorize/LoopVectorizationLegality.cpp      |  2 +-
 .../Vectorize/LoopVectorizationPlanner.h         | 16 ++++++++++++++++
 .../lib/Transforms/Vectorize/VPlanTransforms.cpp | 14 +++++++-------
 3 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index ec67fc05ffcf78..9f6028e398e68c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1878,7 +1878,7 @@ bool LoopVectorizationLegality::canFoldTailByMasking() const {
   // TODO: handle non-reduction outside users when tail is folded by masking.
   for (auto *AE : AllowedExit) {
     // Check that all users of allowed exit values are inside the loop or
-    // are the live-out of a reduction or a CSA
+    // are the live-out of a reduction or a CSA.
     if (ReductionLiveOuts.count(AE) || CSALiveOuts.count(AE))
       continue;
     for (User *U : AE->users()) {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 9d568d4ddc8dc9..77c425a6710a3c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -252,6 +252,22 @@ class VPBuilder {
                              {Cond, MaskPhi, AnyActive}, DL, Name);
   }
 
+  VPInstruction *createCSAAnyActiveEVL(VPValue *Cond, VPValue *EVL, DebugLoc DL,
+                                       const Twine &Name) {
+    return createInstruction(VPInstruction::CSAAnyActiveEVL, {Cond, EVL}, DL,
+                             Name);
+  }
+
+  VPInstruction *createCSAVLPhi(DebugLoc DL, const Twine &Name) {
+    return createInstruction(VPInstruction::CSAVLPhi, {}, DL, Name);
+  }
+
+  VPInstruction *createCSAVLSel(VPValue *AnyActiveEVL, VPValue *VLPhi,
+                                VPValue *EVL, DebugLoc DL, const Twine &Name) {
+    return createInstruction(VPInstruction::CSAVLSel,
+                             {AnyActiveEVL, VLPhi, EVL}, DL, Name);
+  }
+
   VPDerivedIVRecipe *createDerivedIV(InductionDescriptor::InductionKind Kind,
                                      FPMathOperator *FPBinOp, VPValue *Start,
                                      VPCanonicalIVPHIRecipe *CanonicalIV,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 0788b1faaeb0dd..a4cc358d0011c5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1361,9 +1361,10 @@ static void addExplicitVectorLengthForCSA(
     // EVL number of elements in the mask. Replace CSAAnyActive with the EVL
     // specific CSAAnyActiveEVL instruction.
     auto *VPAnyActive = CSAState->getVPAnyActive();
-    auto *VPAnyActiveEVL = new VPInstruction(
-        VPInstruction::CSAAnyActiveEVL, {VPAnyActive->getOperand(0), &EVL},
-        VPAnyActive->getDebugLoc(), "csa.cond.anyactive");
+    VPBuilder B;
+    auto *VPAnyActiveEVL = B.createCSAAnyActiveEVL(
+        VPAnyActive->getOperand(0), &EVL, VPAnyActive->getDebugLoc(),
+        "csa.cond.anyactive");
     VPAnyActiveEVL->insertBefore(VPAnyActive);
     VPAnyActive->replaceAllUsesWith(VPAnyActiveEVL->getVPSingleValue());
     VPAnyActive->eraseFromParent();
@@ -1376,11 +1377,10 @@ static void addExplicitVectorLengthForCSA(
     // extract the scalar from the data vector, we must use the EVL that
     // corresponds to the EVL that was used when the mask vector was last
     // updated. To do this, we introduce CSAVLPhi and CSAVLSel instructions
-    auto *VPVLPhi =
-        new VPInstruction(VPInstruction::CSAVLPhi, {}, {}, "csa.vl.phi");
+
+    auto *VPVLPhi = B.createCSAVLPhi({}, "csa.vl.phi");
     auto *VPVLSel =
-        new VPInstruction(VPInstruction::CSAVLSel,
-                          {VPAnyActiveEVL, VPVLPhi, &EVL}, {}, "csa.vl.sel");
+        B.createCSAVLSel(VPAnyActiveEVL, VPVLPhi, &EVL, {}, "csa.vl.sel");
     VPVLPhi->insertAfter(CSAState->getPhiRecipe());
     VPVLSel->insertAfter(VPAnyActiveEVL);
 

>From 794d45960a1e8f2d0a1c4b6baa5465fa71f427a4 Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Fri, 27 Sep 2024 10:08:20 -0700
Subject: [PATCH 19/23] fixup! rebase, respond to reviews, and refactor tests

---
 .../Transforms/Vectorize/LoopVectorize.cpp    |    4 +-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  106 +-
 .../Transforms/Vectorize/VPlanVerifier.cpp    |   11 +-
 .../RISCV/conditional-scalar-assignment.ll    | 2344 +++++++++++++---
 .../conditional-scalar-assignment.ll          | 2415 -----------------
 5 files changed, 2027 insertions(+), 2853 deletions(-)
 delete mode 100644 llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment.ll

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6d046451a7348d..295e9b93a23028 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2926,8 +2926,8 @@ void InnerLoopVectorizer::fixCSALiveOuts(VPTransformState &State, VPlan &Plan) {
     assert(VPDataUpdate &&
            "VPDataUpdate must have been introduced prior to fixing live outs");
     Value *V = VPDataUpdate->getUnderlyingValue();
-    Value *ExtractedScalar = State.get(CSA.second->getExtractScalarRecipe(), 0,
-                                       /*NeedsScalar=*/true);
+    Value *ExtractedScalar =
+        State.get(CSA.second->getExtractScalarRecipe(), /*NeedsScalar=*/true);
     // Fix LCSSAPhis
     llvm::SmallPtrSet<PHINode *, 2> ToFix;
     for (User *U : V->users())
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index b4b50456821ace..61c54050611ba9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -674,61 +674,46 @@ Value *VPInstruction::generate(VPTransformState &State) {
     return NewPhi;
   }
   case VPInstruction::CSAInitMask: {
-    if (Part == 0) {
-      Value *InitMask = ConstantAggregateZero::get(VectorType::get(
-          Type::getInt1Ty(State.Builder.getContext()), State.VF));
-      State.set(this, InitMask, Part);
-      return InitMask;
-    }
-    return State.get(this, Part - 1);
+    Value *InitMask = ConstantAggregateZero::get(
+        VectorType::get(Type::getInt1Ty(State.Builder.getContext()), State.VF));
+    State.set(this, InitMask);
+    return InitMask;
   }
   case VPInstruction::CSAInitData: {
-    if (Part == 0) {
-      Type *ElemTyp = getOperand(0)->getUnderlyingValue()->getType();
-      Value *InitData = PoisonValue::get(VectorType::get(ElemTyp, State.VF));
-      State.set(this, InitData, Part);
-      return InitData;
-    }
-    return State.get(this, Part - 1);
+    Type *ElemTyp = getOperand(0)->getUnderlyingValue()->getType();
+    Value *InitData = PoisonValue::get(VectorType::get(ElemTyp, State.VF));
+    State.set(this, InitData);
+    return InitData;
   }
   case VPInstruction::CSAMaskPhi: {
-    if (Part == 0) {
-      IRBuilder<>::InsertPointGuard Guard(State.Builder);
-      State.Builder.SetInsertPoint(State.CFG.PrevBB->getFirstNonPHI());
-      BasicBlock *PreheaderBB = State.CFG.getPreheaderBBFor(this);
-      Value *InitMask = State.get(getOperand(0), Part);
-      PHINode *MaskPhi =
-          State.Builder.CreatePHI(InitMask->getType(), 2, "csa.mask.phi");
-      MaskPhi->addIncoming(InitMask, PreheaderBB);
-      State.set(this, MaskPhi, Part);
-      return MaskPhi;
-    }
-    Value *V = State.get(this, Part - 1);
-    return V;
+    IRBuilder<>::InsertPointGuard Guard(State.Builder);
+    State.Builder.SetInsertPoint(State.CFG.PrevBB->getFirstNonPHI());
+    BasicBlock *PreheaderBB = State.CFG.getPreheaderBBFor(this);
+    Value *InitMask = State.get(getOperand(0));
+    PHINode *MaskPhi =
+        State.Builder.CreatePHI(InitMask->getType(), 2, "csa.mask.phi");
+    MaskPhi->addIncoming(InitMask, PreheaderBB);
+    State.set(this, MaskPhi);
+    return MaskPhi;
   }
   case VPInstruction::CSAMaskSel: {
-    Value *WidenedCond = State.get(getOperand(0), Part);
-    Value *MaskPhi = State.get(getOperand(1), Part);
-    Value *AnyActive = State.get(getOperand(2), Part, /*NeedsScalar=*/true);
-    // If not the first Part, use the mask from the previous unrolled Part
-    Value *OldMask = Part == 0 ? MaskPhi : State.get(this, Part - 1);
-    Value *MaskSel = State.Builder.CreateSelect(AnyActive, WidenedCond, OldMask,
+    Value *WidenedCond = State.get(getOperand(0));
+    Value *MaskPhi = State.get(getOperand(1));
+    Value *AnyActive = State.get(getOperand(2), /*NeedsScalar=*/true);
+    Value *MaskSel = State.Builder.CreateSelect(AnyActive, WidenedCond, MaskPhi,
                                                 "csa.mask.sel");
-    // MaskPhi wants to use the most recently updated mask. That's the one
-    // that corresponds to the last Part.
-    if (Part == State.UF - 1)
-      cast<PHINode>(MaskPhi)->addIncoming(MaskSel, State.CFG.PrevBB);
+    cast<PHINode>(MaskPhi)->addIncoming(MaskSel, State.CFG.PrevBB);
     return MaskSel;
   }
   case VPInstruction::CSAAnyActive: {
-    Value *WidenedCond = State.get(getOperand(0), Part);
+    Value *WidenedCond = State.get(getOperand(0));
     return Builder.CreateOrReduce(WidenedCond);
   }
   case VPInstruction::CSAAnyActiveEVL: {
-    Value *WidenedCond = State.get(getOperand(0), Part);
+    Value *WidenedCond = State.get(getOperand(0));
     Value *AllOnesMask = Constant::getAllOnesValue(
         VectorType::get(Type::getInt1Ty(State.Builder.getContext()), State.VF));
-    Value *EVL = State.get(getOperand(1), Part, /*NeedsScalar=*/true);
+    Value *EVL = State.get(getOperand(1), /*NeedsScalar=*/true);
 
     Value *StartValue =
         ConstantInt::get(WidenedCond->getType()->getScalarType(), 0);
@@ -751,9 +736,9 @@ Value *VPInstruction::generate(VPTransformState &State) {
     return VLPhi;
   }
   case VPInstruction::CSAVLSel: {
-    Value *AnyActive = State.get(getOperand(0), Part, /*NeedsScalar=*/true);
-    Value *VLPhi = State.get(getOperand(1), Part, /*NeedsScalar=*/true);
-    Value *EVL = State.get(getOperand(2), Part, /*NeedsScalar=*/true);
+    Value *AnyActive = State.get(getOperand(0), /*NeedsScalar=*/true);
+    Value *VLPhi = State.get(getOperand(1), /*NeedsScalar=*/true);
+    Value *EVL = State.get(getOperand(2), /*NeedsScalar=*/true);
     Value *VLSel =
         State.Builder.CreateSelect(AnyActive, EVL, VLPhi, "csa.vl.sel");
     cast<PHINode>(VLPhi)->addIncoming(VLSel, State.CFG.PrevBB);
@@ -2299,9 +2284,7 @@ void VPCSAHeaderPHIRecipe::execute(VPTransformState &State) {
   // may not have been executed. We let VPCSADataUpdateRecipe::execute add the
   // incoming operand to DataPhi.
 
-  // Use the same DataPhi for all Parts
-  for (unsigned Part = 0; Part < State.UF; ++Part)
-    State.set(this, DataPhi, Part);
+  State.set(this, DataPhi);
 }
 
 InstructionCost VPCSAHeaderPHIRecipe::computeCost(ElementCount VF,
@@ -2333,21 +2316,15 @@ void VPCSADataUpdateRecipe::print(raw_ostream &O, const Twine &Indent,
 #endif
 
 void VPCSADataUpdateRecipe::execute(VPTransformState &State) {
-  for (unsigned Part = 0; Part < State.UF; ++Part) {
-    Value *AnyActive = State.get(getVPAnyActive(), Part, /*NeedsScalar=*/true);
-    Value *DataUpdate = getVPDataPhi() == getVPTrue()
-                            ? State.get(getVPFalse(), Part)
-                            : State.get(getVPTrue(), Part);
-    PHINode *DataPhi = cast<PHINode>(State.get(getVPDataPhi(), Part));
-    // If not the first Part, use the mask from the previous unrolled Part
-    Value *OldData = Part == 0 ? DataPhi : State.get(this, Part - 1);
-    Value *DataSel = State.Builder.CreateSelect(AnyActive, DataUpdate, OldData,
-                                                "csa.data.sel");
+  Value *AnyActive = State.get(getVPAnyActive(), /*NeedsScalar=*/true);
+  Value *DataUpdate = getVPDataPhi() == getVPTrue() ? State.get(getVPFalse())
+                                                    : State.get(getVPTrue());
+  PHINode *DataPhi = cast<PHINode>(State.get(getVPDataPhi()));
+  Value *DataSel = State.Builder.CreateSelect(AnyActive, DataUpdate, DataPhi,
+                                              "csa.data.sel");
 
-    if (Part == State.UF - 1)
-      DataPhi->addIncoming(DataSel, State.CFG.PrevBB);
-    State.set(this, DataSel, Part);
-  }
+  DataPhi->addIncoming(DataSel, State.CFG.PrevBB);
+  State.set(this, DataSel);
 }
 
 InstructionCost VPCSADataUpdateRecipe::computeCost(ElementCount VF,
@@ -2395,10 +2372,9 @@ void VPCSAExtractScalarRecipe::execute(VPTransformState &State) {
   IRBuilder<>::InsertPointGuard Guard(State.Builder);
   State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI());
 
-  unsigned LastPart = State.UF - 1;
   Value *InitScalar = getVPInitScalar()->getLiveInIRValue();
-  Value *MaskSel = State.get(getVPMaskSel(), LastPart);
-  Value *DataSel = State.get(getVPDataSel(), LastPart);
+  Value *MaskSel = State.get(getVPMaskSel());
+  Value *DataSel = State.get(getVPDataSel());
 
   Value *LastIdx = nullptr;
   Value *IndexVec = State.Builder.CreateStepVector(
@@ -2410,7 +2386,7 @@ void VPCSAExtractScalarRecipe::execute(VPTransformState &State) {
     // index in the data vector to extract from. If no element in the mask
     // is active, we pick -1. If we pick -1, then we will use the initial scalar
     // value instead of extracting from the data vector.
-    Value *VL = State.get(getVPCSAVLSel(), LastPart, /*NeedsScalar=*/true);
+    Value *VL = State.get(getVPCSAVLSel(), /*NeedsScalar=*/true);
     LastIdx = State.Builder.CreateIntrinsic(NegOne->getType(),
                                             Intrinsic::vp_reduce_smax,
                                             {NegOne, IndexVec, MaskSel, VL});
@@ -2438,7 +2414,7 @@ void VPCSAExtractScalarRecipe::execute(VPTransformState &State) {
   Value *LastIdxGEZero = State.Builder.CreateICmpSGE(LastIdx, Zero);
   Value *ChooseFromVecOrInit =
       State.Builder.CreateSelect(LastIdxGEZero, ExtractFromVec, InitScalar);
-  State.set(this, ChooseFromVecOrInit, 0, /*IsScalar=*/true);
+  State.set(this, ChooseFromVecOrInit, /*IsScalar=*/true);
 }
 
 InstructionCost
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 808879b3f9c3b9..e7fea8ac2fafa8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -155,9 +155,14 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
              .Case<VPScalarCastRecipe>(
                  [&](const VPScalarCastRecipe *S) { return true; })
              .Case<VPInstruction>([&](const VPInstruction *I) {
-               if (I->getOpcode() != Instruction::Add) {
-                 errs()
-                     << "EVL is used as an operand in non-VPInstruction::Add\n";
+               unsigned Opc = I->getOpcode();
+               if (Opc == VPInstruction::CSAAnyActiveEVL ||
+                   Opc == VPInstruction::CSAVLSel)
+                 return true;
+
+               if (Opc != Instruction::Add) {
+                 errs() << "EVL is used as an operand that does not expect EVL "
+                           "operand\n";
                  return false;
                }
                if (I->getNumUsers() != 1) {
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/conditional-scalar-assignment.ll b/llvm/test/Transforms/LoopVectorize/RISCV/conditional-scalar-assignment.ll
index 7c625e0d98e569..cd6a55fba46032 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/conditional-scalar-assignment.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/conditional-scalar-assignment.ll
@@ -1,10 +1,1876 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -S -passes=loop-vectorize -force-tail-folding-style=data-with-evl \
-; RUN:   -enable-csa-vectorization -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-min=256 \
-; RUN:   | FileCheck %s -check-prefix=EVL
-; RUN: opt < %s -S -passes=loop-vectorize -force-tail-folding-style=none \
-; RUN:   -enable-csa-vectorization -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-min=256 \
-; RUN:   | FileCheck %s -check-prefix=NO-EVL
+; RUN: opt < %s -S -passes=loop-vectorize -mtriple=riscv64 -mattr=+v \
+; RUN:   -prefer-predicate-over-epilogue=predicate-dont-vectorize  \
+; RUN:   -force-tail-folding-style=data-with-evl -enable-csa-vectorization \
+; RUN:   | FileCheck %s -check-prefixes=CHECK,EVL
+; RUN: opt < %s -S -passes=loop-vectorize -mtriple=riscv64 -mattr=+v \
+; RUN:   -force-tail-folding-style=none -enable-csa-vectorization \
+; RUN:   | FileCheck %s -check-prefixes=CHECK,NO-EVL
+
+; This function is generated from the following C/C++ program:
+; int simple_csa_int_select(int N, int *data, int a) {
+;   int t = -1;
+;   for (int i = 0; i < N; i++) {
+;     if (a < data[i])
+;       t = data[i];
+;   }
+;   return t; // use t
+; }
+define i32 @simple_csa_int_select(i32 %N, ptr %data, i64 %a) {
+; EVL-LABEL: @simple_csa_int_select(
+; EVL-NEXT:  entry:
+; EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
+; EVL:       for.body.preheader:
+; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; EVL:       vector.ph:
+; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP2]]
+; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[A:%.*]], i64 0
+; EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; EVL:       vector.body:
+; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_VL_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[CSA_VL_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[AVL:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[EVL_BASED_IV]]
+; EVL-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; EVL-NEXT:    [[TMP6:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP6]]
+; EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP8]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP5]])
+; EVL-NEXT:    [[TMP9:%.*]] = sext <vscale x 4 x i32> [[VP_OP_LOAD]] to <vscale x 4 x i64>
+; EVL-NEXT:    [[TMP10:%.*]] = icmp slt <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP9]]
+; EVL-NEXT:    [[CSA_COND_ANYACTIVE:%.*]] = call i1 @llvm.vp.reduce.or.nxv4i1(i1 false, <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP5]])
+; EVL-NEXT:    [[CSA_VL_SEL]] = select i1 [[CSA_COND_ANYACTIVE]], i32 [[TMP5]], i32 [[CSA_VL_PHI]]
+; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[CSA_COND_ANYACTIVE]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[CSA_COND_ANYACTIVE]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
+; EVL-NEXT:    [[TMP11:%.*]] = zext i32 [[TMP5]] to i64
+; EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP11]], [[EVL_BASED_IV]]
+; EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; EVL-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; EVL:       middle.block:
+; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; EVL-NEXT:    [[TMP13:%.*]] = call i32 @llvm.vp.reduce.smax.nxv4i32(i32 -1, <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i1> [[CSA_MASK_SEL]], i32 [[CSA_VL_SEL]])
+; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP13]]
+; EVL-NEXT:    [[TMP14:%.*]] = icmp sge i32 [[TMP13]], 0
+; EVL-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[CSA_EXTRACT]], i32 -1
+; EVL-NEXT:    br i1 true, label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL:       scalar.ph:
+; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; EVL:       exit.loopexit:
+; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT:    br label [[EXIT]]
+; EVL:       exit:
+; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT]] ]
+; EVL-NEXT:    ret i32 [[T_0_LCSSA]]
+; EVL:       for.body:
+; EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
+; EVL-NEXT:    [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT:    [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
+; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP17]]
+; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP16]], i32 [[T_010]]
+; EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+;
+; NO-EVL-LABEL: @simple_csa_int_select(
+; NO-EVL-NEXT:  entry:
+; NO-EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
+; NO-EVL:       for.body.preheader:
+; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-EVL:       vector.ph:
+; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; NO-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[A:%.*]], i64 0
+; NO-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NO-EVL:       vector.body:
+; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP6]]
+; NO-EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; NO-EVL-NEXT:    [[TMP9:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
+; NO-EVL-NEXT:    [[TMP10:%.*]] = icmp slt <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP9]]
+; NO-EVL-NEXT:    [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP10]])
+; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP11]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP11]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-EVL-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-EVL:       middle.block:
+; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; NO-EVL-NEXT:    [[TMP13:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; NO-EVL-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP13]])
+; NO-EVL-NEXT:    [[TMP15:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[TMP14]], 0
+; NO-EVL-NEXT:    [[TMP17:%.*]] = and i1 [[TMP15]], [[TMP16]]
+; NO-EVL-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i32 0, i32 -1
+; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP18]]
+; NO-EVL-NEXT:    [[TMP19:%.*]] = icmp sge i32 [[TMP18]], 0
+; NO-EVL-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[CSA_EXTRACT]], i32 -1
+; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL:       scalar.ph:
+; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-EVL:       exit.loopexit:
+; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT:    br label [[EXIT]]
+; NO-EVL:       exit:
+; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT]] ]
+; NO-EVL-NEXT:    ret i32 [[T_0_LCSSA]]
+; NO-EVL:       for.body:
+; NO-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
+; NO-EVL-NEXT:    [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT:    [[TMP22:%.*]] = sext i32 [[TMP21]] to i64
+; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP22]]
+; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP21]], i32 [[T_010]]
+; NO-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+;
+entry:
+  %cmp9 = icmp sgt i32 %N, 0
+  br i1 %cmp9, label %for.body.preheader, label %exit
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+exit:                                 ; preds = %for.body, %entry
+  %t.0.lcssa = phi i32 [ -1, %entry ], [ %spec.select, %for.body ]
+  ret i32 %t.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
+  %t.010 = phi i32 [ -1, %for.body.preheader ], [ %spec.select, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %data, i64 %iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %1 = sext i32 %0 to i64
+  %cmp1 = icmp slt i64 %a, %1
+  %spec.select = select i1 %cmp1, i32 %0, i32 %t.010
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %exit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int simple_csa_int_select_induction_cmp(int N, int *data) {
+;   int t = -1;
+;   for (int i = 0; i < N; i++) {
+;     if (i < data[i])
+;       t = data[i];
+;   }
+;   return t; // use t
+; }
+define i32 @simple_csa_int_select_induction_cmp(i32 %N, ptr %data) {
+; EVL-LABEL: @simple_csa_int_select_induction_cmp(
+; EVL-NOT: vector.body:
+;
+; NO-EVL-LABEL: @simple_csa_int_select_induction_cmp(
+; NO-EVL-NEXT:  entry:
+; NO-EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
+; NO-EVL:       for.body.preheader:
+; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-EVL:       vector.ph:
+; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; NO-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-EVL-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; NO-EVL-NEXT:    [[TMP7:%.*]] = add <vscale x 4 x i64> [[TMP6]], zeroinitializer
+; NO-EVL-NEXT:    [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP7]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; NO-EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
+; NO-EVL-NEXT:    [[TMP9:%.*]] = mul i64 1, [[TMP5]]
+; NO-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP9]], i64 0
+; NO-EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NO-EVL:       vector.body:
+; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP10]]
+; NO-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
+; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP12]], align 4
+; NO-EVL-NEXT:    [[TMP13:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
+; NO-EVL-NEXT:    [[TMP14:%.*]] = icmp slt <vscale x 4 x i64> [[VEC_IND]], [[TMP13]]
+; NO-EVL-NEXT:    [[TMP15:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP14]])
+; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP15]], <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP15]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; NO-EVL-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; NO-EVL:       middle.block:
+; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; NO-EVL-NEXT:    [[TMP17:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; NO-EVL-NEXT:    [[TMP18:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP17]])
+; NO-EVL-NEXT:    [[TMP19:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT:    [[TMP20:%.*]] = icmp eq i32 [[TMP18]], 0
+; NO-EVL-NEXT:    [[TMP21:%.*]] = and i1 [[TMP19]], [[TMP20]]
+; NO-EVL-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], i32 0, i32 -1
+; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP22]]
+; NO-EVL-NEXT:    [[TMP23:%.*]] = icmp sge i32 [[TMP22]], 0
+; NO-EVL-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i32 [[CSA_EXTRACT]], i32 -1
+; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL:       scalar.ph:
+; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-EVL:       exit.loopexit:
+; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP24]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT:    br label [[EXIT]]
+; NO-EVL:       exit:
+; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT]] ]
+; NO-EVL-NEXT:    ret i32 [[T_0_LCSSA]]
+; NO-EVL:       for.body:
+; NO-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
+; NO-EVL-NEXT:    [[TMP25:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT:    [[TMP26:%.*]] = sext i32 [[TMP25]] to i64
+; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[IV]], [[TMP26]]
+; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP25]], i32 [[T_010]]
+; NO-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+;
+entry:
+  %cmp9 = icmp sgt i32 %N, 0
+  br i1 %cmp9, label %for.body.preheader, label %exit
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+exit:                                 ; preds = %for.body, %entry
+  %t.0.lcssa = phi i32 [ -1, %entry ], [ %spec.select, %for.body ]
+  ret i32 %t.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
+  %t.010 = phi i32 [ -1, %for.body.preheader ], [ %spec.select, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %data, i64 %iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %1 = sext i32 %0 to i64
+  %cmp1 = icmp slt i64 %iv, %1
+  %spec.select = select i1 %cmp1, i32 %0, i32 %t.010
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %exit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; float simple_csa_float_select(int N, float *data) {
+;   float t = 1.0f;
+;   for (int i = 0; i < N; i++) {
+;     if (0.0f < data[i])
+;       t = data[i];
+;   }
+;   return t; // use t
+; }
+define float @simple_csa_float_select(i32 %N, ptr %data) {
+; EVL-LABEL: @simple_csa_float_select(
+; EVL-NEXT:  entry:
+; EVL-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT:    br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
+; EVL:       for.body.preheader:
+; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; EVL:       vector.ph:
+; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP2]]
+; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; EVL:       vector.body:
+; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_VL_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[CSA_VL_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[AVL:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[EVL_BASED_IV]]
+; EVL-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; EVL-NEXT:    [[TMP6:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[TMP6]]
+; EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
+; EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP8]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP5]])
+; EVL-NEXT:    [[TMP9:%.*]] = fcmp ogt <vscale x 4 x float> [[VP_OP_LOAD]], zeroinitializer
+; EVL-NEXT:    [[CSA_COND_ANYACTIVE:%.*]] = call i1 @llvm.vp.reduce.or.nxv4i1(i1 false, <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP5]])
+; EVL-NEXT:    [[CSA_VL_SEL]] = select i1 [[CSA_COND_ANYACTIVE]], i32 [[TMP5]], i32 [[CSA_VL_PHI]]
+; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[CSA_COND_ANYACTIVE]], <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[CSA_COND_ANYACTIVE]], <vscale x 4 x float> [[VP_OP_LOAD]], <vscale x 4 x float> [[CSA_DATA_PHI]]
+; EVL-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP5]] to i64
+; EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP10]], [[EVL_BASED_IV]]
+; EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; EVL-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; EVL:       middle.block:
+; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; EVL-NEXT:    [[TMP12:%.*]] = call i32 @llvm.vp.reduce.smax.nxv4i32(i32 -1, <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i1> [[CSA_MASK_SEL]], i32 [[CSA_VL_SEL]])
+; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x float> [[CSA_DATA_SEL]], i32 [[TMP12]]
+; EVL-NEXT:    [[TMP13:%.*]] = icmp sge i32 [[TMP12]], 0
+; EVL-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], float [[CSA_EXTRACT]], float 1.000000e+00
+; EVL-NEXT:    br i1 true, label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL:       scalar.ph:
+; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; EVL:       exit.loopexit:
+; EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT:    br label [[EXIT]]
+; EVL:       exit:
+; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[EXIT_LOOPEXIT]] ]
+; EVL-NEXT:    ret float [[T_0_LCSSA]]
+; EVL:       for.body:
+; EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[T_09:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[IV]]
+; EVL-NEXT:    [[TMP15:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP15]], 0.000000e+00
+; EVL-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP15]], float [[T_09]]
+; EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+;
+; NO-EVL-LABEL: @simple_csa_float_select(
+; NO-EVL-NEXT:  entry:
+; NO-EVL-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT:    br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
+; NO-EVL:       for.body.preheader:
+; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-EVL:       vector.ph:
+; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; NO-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NO-EVL:       vector.body:
+; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[TMP6]]
+; NO-EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
+; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
+; NO-EVL-NEXT:    [[TMP9:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD]], zeroinitializer
+; NO-EVL-NEXT:    [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP9]])
+; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP10]], <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP10]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-EVL-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; NO-EVL:       middle.block:
+; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; NO-EVL-NEXT:    [[TMP12:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; NO-EVL-NEXT:    [[TMP13:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP12]])
+; NO-EVL-NEXT:    [[TMP14:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[TMP13]], 0
+; NO-EVL-NEXT:    [[TMP16:%.*]] = and i1 [[TMP14]], [[TMP15]]
+; NO-EVL-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 0, i32 -1
+; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x float> [[CSA_DATA_SEL]], i32 [[TMP17]]
+; NO-EVL-NEXT:    [[TMP18:%.*]] = icmp sge i32 [[TMP17]], 0
+; NO-EVL-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], float [[CSA_EXTRACT]], float 1.000000e+00
+; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL:       scalar.ph:
+; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-EVL:       exit.loopexit:
+; NO-EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT:    br label [[EXIT]]
+; NO-EVL:       exit:
+; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[EXIT_LOOPEXIT]] ]
+; NO-EVL-NEXT:    ret float [[T_0_LCSSA]]
+; NO-EVL:       for.body:
+; NO-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[T_09:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[IV]]
+; NO-EVL-NEXT:    [[TMP20:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP20]], 0.000000e+00
+; NO-EVL-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP20]], float [[T_09]]
+; NO-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+;
+entry:
+  %cmp8 = icmp sgt i32 %N, 0
+  br i1 %cmp8, label %for.body.preheader, label %exit
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+exit:                                 ; preds = %for.body, %entry
+  %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.1, %for.body ]
+  ret float %t.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
+  %t.09 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.body ]
+  %arrayidx = getelementptr inbounds float, ptr %data, i64 %iv
+  %0 = load float, ptr %arrayidx, align 4
+  %cmp1 = fcmp ogt float %0, 0.000000e+00
+  %t.1 = select i1 %cmp1, float %0, float %t.09
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %exit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int simple_csa_int(int N, bool *cond, int *data) {
+;   int t = -1;
+;   for (int i = 0; i < N; i++) {
+;     if (cond[i])
+;       t = data[i];
+;   }
+;   return t; // use t
+; }
+define i32 @simple_csa_int(i32 %N, ptr %cond, ptr %data) {
+; CHECK-LABEL: @simple_csa_int(
+; CHECK-NOT: vector.body:
+;
+entry:
+  %cmp6 = icmp sgt i32 %N, 0
+  br i1 %cmp6, label %for.body.preheader, label %exit
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+exit:                                 ; preds = %for.inc, %entry
+  %t.0.lcssa = phi i32 [ -1, %entry ], [ %t.1, %for.inc ]
+  ret i32 %t.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
+  %t.07 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
+  %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %tobool.not = icmp eq i8 %0, 0
+  br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %arrayidx2 = getelementptr inbounds i32, ptr %data, i64 %iv
+  %1 = load i32, ptr %arrayidx2, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %t.1 = phi i32 [ %1, %if.then ], [ %t.07, %for.body ]
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %exit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; float simple_csa_float(int N, bool *cond, float *data) {
+;   float t = 1.0f;
+;   for (int i = 0; i < N; i++) {
+;     if (cond[i])
+;       t = data[i];
+;   }
+;   return t; // use t
+; }
+define float @simple_csa_float(i32 %N, ptr %cond, ptr %data) {
+; CHECK-LABEL: @simple_csa_float(
+; CHECK-NOT: vector.body:
+;
+entry:
+  %cmp6 = icmp sgt i32 %N, 0
+  br i1 %cmp6, label %for.body.preheader, label %exit
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+exit:                                 ; preds = %for.inc, %entry
+  %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.1, %for.inc ]
+  ret float %t.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
+  %t.07 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
+  %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %tobool.not = icmp eq i8 %0, 0
+  br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %arrayidx2 = getelementptr inbounds float, ptr %data, i64 %iv
+  %1 = load float, ptr %arrayidx2, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %t.1 = phi float [ %1, %if.then ], [ %t.07, %for.body ]
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %exit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int csa_in_series_int_select(int N, int *data0, int *data1, int a) {
+;   int t = -1;
+;   int s = -1;
+;   for (int i = 0; i < N; i++) {
+;     if (a < data0[i])
+;       t = data0[i];
+;     if (a < data1[i])
+;       s = data1[i];
+;   }
+;   return t | s; // use t and s
+; }
+define i32 @csa_in_series_int_select(i32 %N, ptr %data0, ptr %data1, i64 %a) {
+; EVL-LABEL: @csa_in_series_int_select(
+; EVL-NEXT:  entry:
+; EVL-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
+; EVL:       for.body.preheader:
+; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; EVL:       vector.ph:
+; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP2]]
+; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[A:%.*]], i64 0
+; EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; EVL:       vector.body:
+; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL7:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL8:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_VL_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[CSA_VL_SEL6:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_VL_PHI3:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[CSA_VL_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[AVL:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[EVL_BASED_IV]]
+; EVL-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; EVL-NEXT:    [[TMP6:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP6]]
+; EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP8]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP5]])
+; EVL-NEXT:    [[TMP9:%.*]] = sext <vscale x 4 x i32> [[VP_OP_LOAD]] to <vscale x 4 x i64>
+; EVL-NEXT:    [[TMP10:%.*]] = icmp slt <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP9]]
+; EVL-NEXT:    [[CSA_COND_ANYACTIVE:%.*]] = call i1 @llvm.vp.reduce.or.nxv4i1(i1 false, <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP5]])
+; EVL-NEXT:    [[CSA_VL_SEL]] = select i1 [[CSA_COND_ANYACTIVE]], i32 [[TMP5]], i32 [[CSA_VL_PHI3]]
+; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[CSA_COND_ANYACTIVE]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> [[CSA_MASK_PHI1]]
+; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[CSA_COND_ANYACTIVE]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI2]]
+; EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP6]]
+; EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
+; EVL-NEXT:    [[VP_OP_LOAD4:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP5]])
+; EVL-NEXT:    [[TMP13:%.*]] = sext <vscale x 4 x i32> [[VP_OP_LOAD4]] to <vscale x 4 x i64>
+; EVL-NEXT:    [[TMP14:%.*]] = icmp slt <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP13]]
+; EVL-NEXT:    [[CSA_COND_ANYACTIVE5:%.*]] = call i1 @llvm.vp.reduce.or.nxv4i1(i1 false, <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP5]])
+; EVL-NEXT:    [[CSA_VL_SEL6]] = select i1 [[CSA_COND_ANYACTIVE5]], i32 [[TMP5]], i32 [[CSA_VL_PHI]]
+; EVL-NEXT:    [[CSA_MASK_SEL7]] = select i1 [[CSA_COND_ANYACTIVE5]], <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT:    [[CSA_DATA_SEL8]] = select i1 [[CSA_COND_ANYACTIVE5]], <vscale x 4 x i32> [[VP_OP_LOAD4]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
+; EVL-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP5]] to i64
+; EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]]
+; EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; EVL-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; EVL:       middle.block:
+; EVL-NEXT:    [[CSA_STEP9:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; EVL-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vp.reduce.smax.nxv4i32(i32 -1, <vscale x 4 x i32> [[CSA_STEP9]], <vscale x 4 x i1> [[CSA_MASK_SEL7]], i32 [[CSA_VL_SEL6]])
+; EVL-NEXT:    [[CSA_EXTRACT10:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL8]], i32 [[TMP17]]
+; EVL-NEXT:    [[TMP18:%.*]] = icmp sge i32 [[TMP17]], 0
+; EVL-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[CSA_EXTRACT10]], i32 -1
+; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; EVL-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vp.reduce.smax.nxv4i32(i32 -1, <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i1> [[CSA_MASK_SEL]], i32 [[CSA_VL_SEL]])
+; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP20]]
+; EVL-NEXT:    [[TMP21:%.*]] = icmp sge i32 [[TMP20]], 0
+; EVL-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[CSA_EXTRACT]], i32 -1
+; EVL-NEXT:    br i1 true, label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL:       scalar.ph:
+; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; EVL:       exit.loopexit:
+; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT:    [[TMP23:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
+; EVL-NEXT:    br label [[EXIT]]
+; EVL:       exit:
+; EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP23]], [[EXIT_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
+; EVL-NEXT:    ret i32 [[OR]]
+; EVL:       for.body:
+; EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[IV]]
+; EVL-NEXT:    [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT:    [[TMP25:%.*]] = sext i32 [[TMP24]] to i64
+; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP25]]
+; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP24]], i32 [[T_022]]
+; EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[IV]]
+; EVL-NEXT:    [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; EVL-NEXT:    [[TMP27:%.*]] = sext i32 [[TMP26]] to i64
+; EVL-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[A]], [[TMP27]]
+; EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP26]], i32 [[S_023]]
+; EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+;
+; NO-EVL-LABEL: @csa_in_series_int_select(
+; NO-EVL-NEXT:  entry:
+; NO-EVL-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
+; NO-EVL:       for.body.preheader:
+; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-EVL:       vector.ph:
+; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; NO-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[A:%.*]], i64 0
+; NO-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NO-EVL:       vector.body:
+; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP6]]
+; NO-EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; NO-EVL-NEXT:    [[TMP9:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
+; NO-EVL-NEXT:    [[TMP10:%.*]] = icmp slt <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP9]]
+; NO-EVL-NEXT:    [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP10]])
+; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP11]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> [[CSA_MASK_PHI1]]
+; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP11]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI2]]
+; NO-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP6]]
+; NO-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
+; NO-EVL-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i32>, ptr [[TMP13]], align 4
+; NO-EVL-NEXT:    [[TMP14:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD3]] to <vscale x 4 x i64>
+; NO-EVL-NEXT:    [[TMP15:%.*]] = icmp slt <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP14]]
+; NO-EVL-NEXT:    [[TMP16:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP15]])
+; NO-EVL-NEXT:    [[CSA_MASK_SEL4]] = select i1 [[TMP16]], <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT:    [[CSA_DATA_SEL5]] = select i1 [[TMP16]], <vscale x 4 x i32> [[WIDE_LOAD3]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-EVL-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; NO-EVL:       middle.block:
+; NO-EVL-NEXT:    [[CSA_STEP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; NO-EVL-NEXT:    [[TMP18:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL4]], <vscale x 4 x i32> [[CSA_STEP6]], <vscale x 4 x i32> zeroinitializer
+; NO-EVL-NEXT:    [[TMP19:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP18]])
+; NO-EVL-NEXT:    [[TMP20:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL4]], i64 0
+; NO-EVL-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP19]], 0
+; NO-EVL-NEXT:    [[TMP22:%.*]] = and i1 [[TMP20]], [[TMP21]]
+; NO-EVL-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], i32 0, i32 -1
+; NO-EVL-NEXT:    [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL5]], i32 [[TMP23]]
+; NO-EVL-NEXT:    [[TMP24:%.*]] = icmp sge i32 [[TMP23]], 0
+; NO-EVL-NEXT:    [[TMP25:%.*]] = select i1 [[TMP24]], i32 [[CSA_EXTRACT7]], i32 -1
+; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; NO-EVL-NEXT:    [[TMP26:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; NO-EVL-NEXT:    [[TMP27:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP26]])
+; NO-EVL-NEXT:    [[TMP28:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT:    [[TMP29:%.*]] = icmp eq i32 [[TMP27]], 0
+; NO-EVL-NEXT:    [[TMP30:%.*]] = and i1 [[TMP28]], [[TMP29]]
+; NO-EVL-NEXT:    [[TMP31:%.*]] = select i1 [[TMP30]], i32 0, i32 -1
+; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP31]]
+; NO-EVL-NEXT:    [[TMP32:%.*]] = icmp sge i32 [[TMP31]], 0
+; NO-EVL-NEXT:    [[TMP33:%.*]] = select i1 [[TMP32]], i32 [[CSA_EXTRACT]], i32 -1
+; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL:       scalar.ph:
+; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-EVL:       exit.loopexit:
+; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT:    [[TMP34:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
+; NO-EVL-NEXT:    br label [[EXIT]]
+; NO-EVL:       exit:
+; NO-EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP34]], [[EXIT_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
+; NO-EVL-NEXT:    ret i32 [[OR]]
+; NO-EVL:       for.body:
+; NO-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[IV]]
+; NO-EVL-NEXT:    [[TMP35:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT:    [[TMP36:%.*]] = sext i32 [[TMP35]] to i64
+; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP36]]
+; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP35]], i32 [[T_022]]
+; NO-EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[IV]]
+; NO-EVL-NEXT:    [[TMP37:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; NO-EVL-NEXT:    [[TMP38:%.*]] = sext i32 [[TMP37]] to i64
+; NO-EVL-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[A]], [[TMP38]]
+; NO-EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP37]], i32 [[S_023]]
+; NO-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+;
+entry:
+  %cmp21 = icmp sgt i32 %N, 0
+  br i1 %cmp21, label %for.body.preheader, label %exit
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+exit.loopexit:                        ; preds = %for.body
+  %0 = or i32 %s.1, %spec.select
+  br label %exit
+
+exit:                                 ; preds = %exit.loopexit, %entry
+  %or = phi i32 [ %0, %exit.loopexit ], [ -1, %entry ]
+  ret i32 %or
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
+  %s.023 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.body ]
+  %t.022 = phi i32 [ -1, %for.body.preheader ], [ %spec.select, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %data0, i64 %iv
+  %1 = load i32, ptr %arrayidx, align 4
+  %2 = sext i32 %1 to i64
+  %cmp1 = icmp slt i64 %a, %2
+  %spec.select = select i1 %cmp1, i32 %1, i32 %t.022
+  %arrayidx5 = getelementptr inbounds i32, ptr %data1, i64 %iv
+  %3 = load i32, ptr %arrayidx5, align 4
+  %4 = sext i32 %3 to i64
+  %cmp6 = icmp slt i64 %a, %4
+  %s.1 = select i1 %cmp6, i32 %3, i32 %s.023
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %exit.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int csa_in_series_int_select_induction_cmp(int N, int *data0, int *data1) {
+;   int t = -1;
+;   int s = -1;
+;   for (int i = 0; i < N; i++) {
+;     if (i < data0[i])
+;       t = data0[i];
+;     if (i < data1[i])
+;       s = data1[i];
+;   }
+;   return t | s; // use t and s
+; }
+define i32 @csa_in_series_int_select_induction_cmp(i32 %N, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_in_series_int_select_induction_cmp(
+; EVL-NOT: vector.body:
+;
+; NO-EVL-LABEL: @csa_in_series_int_select_induction_cmp(
+; NO-EVL-NEXT:  entry:
+; NO-EVL-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
+; NO-EVL:       for.body.preheader:
+; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-EVL:       vector.ph:
+; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; NO-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-EVL-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; NO-EVL-NEXT:    [[TMP7:%.*]] = add <vscale x 4 x i64> [[TMP6]], zeroinitializer
+; NO-EVL-NEXT:    [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP7]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; NO-EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
+; NO-EVL-NEXT:    [[TMP9:%.*]] = mul i64 1, [[TMP5]]
+; NO-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP9]], i64 0
+; NO-EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NO-EVL:       vector.body:
+; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP10]]
+; NO-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
+; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP12]], align 4
+; NO-EVL-NEXT:    [[TMP13:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
+; NO-EVL-NEXT:    [[TMP14:%.*]] = icmp slt <vscale x 4 x i64> [[VEC_IND]], [[TMP13]]
+; NO-EVL-NEXT:    [[TMP15:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP14]])
+; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP15]], <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> [[CSA_MASK_PHI1]]
+; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP15]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI2]]
+; NO-EVL-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP10]]
+; NO-EVL-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
+; NO-EVL-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i32>, ptr [[TMP17]], align 4
+; NO-EVL-NEXT:    [[TMP18:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD3]] to <vscale x 4 x i64>
+; NO-EVL-NEXT:    [[TMP19:%.*]] = icmp slt <vscale x 4 x i64> [[VEC_IND]], [[TMP18]]
+; NO-EVL-NEXT:    [[TMP20:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP19]])
+; NO-EVL-NEXT:    [[CSA_MASK_SEL4]] = select i1 [[TMP20]], <vscale x 4 x i1> [[TMP19]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT:    [[CSA_DATA_SEL5]] = select i1 [[TMP20]], <vscale x 4 x i32> [[WIDE_LOAD3]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; NO-EVL-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; NO-EVL:       middle.block:
+; NO-EVL-NEXT:    [[CSA_STEP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; NO-EVL-NEXT:    [[TMP22:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL4]], <vscale x 4 x i32> [[CSA_STEP6]], <vscale x 4 x i32> zeroinitializer
+; NO-EVL-NEXT:    [[TMP23:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP22]])
+; NO-EVL-NEXT:    [[TMP24:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL4]], i64 0
+; NO-EVL-NEXT:    [[TMP25:%.*]] = icmp eq i32 [[TMP23]], 0
+; NO-EVL-NEXT:    [[TMP26:%.*]] = and i1 [[TMP24]], [[TMP25]]
+; NO-EVL-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], i32 0, i32 -1
+; NO-EVL-NEXT:    [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL5]], i32 [[TMP27]]
+; NO-EVL-NEXT:    [[TMP28:%.*]] = icmp sge i32 [[TMP27]], 0
+; NO-EVL-NEXT:    [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[CSA_EXTRACT7]], i32 -1
+; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; NO-EVL-NEXT:    [[TMP30:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; NO-EVL-NEXT:    [[TMP31:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP30]])
+; NO-EVL-NEXT:    [[TMP32:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT:    [[TMP33:%.*]] = icmp eq i32 [[TMP31]], 0
+; NO-EVL-NEXT:    [[TMP34:%.*]] = and i1 [[TMP32]], [[TMP33]]
+; NO-EVL-NEXT:    [[TMP35:%.*]] = select i1 [[TMP34]], i32 0, i32 -1
+; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP35]]
+; NO-EVL-NEXT:    [[TMP36:%.*]] = icmp sge i32 [[TMP35]], 0
+; NO-EVL-NEXT:    [[TMP37:%.*]] = select i1 [[TMP36]], i32 [[CSA_EXTRACT]], i32 -1
+; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL:       scalar.ph:
+; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-EVL:       exit.loopexit:
+; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP37]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP29]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT:    [[TMP38:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
+; NO-EVL-NEXT:    br label [[EXIT]]
+; NO-EVL:       exit:
+; NO-EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP38]], [[EXIT_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
+; NO-EVL-NEXT:    ret i32 [[OR]]
+; NO-EVL:       for.body:
+; NO-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[IV]]
+; NO-EVL-NEXT:    [[TMP39:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT:    [[TMP40:%.*]] = sext i32 [[TMP39]] to i64
+; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[IV]], [[TMP40]]
+; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP39]], i32 [[T_022]]
+; NO-EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[IV]]
+; NO-EVL-NEXT:    [[TMP41:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; NO-EVL-NEXT:    [[TMP42:%.*]] = sext i32 [[TMP41]] to i64
+; NO-EVL-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[IV]], [[TMP42]]
+; NO-EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP41]], i32 [[S_023]]
+; NO-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+;
+entry:
+  %cmp21 = icmp sgt i32 %N, 0
+  br i1 %cmp21, label %for.body.preheader, label %exit
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+exit.loopexit:                        ; preds = %for.body
+  %0 = or i32 %s.1, %spec.select
+  br label %exit
+
+exit:                                 ; preds = %exit.loopexit, %entry
+  %or = phi i32 [ %0, %exit.loopexit ], [ -1, %entry ]
+  ret i32 %or
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
+  %s.023 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.body ]
+  %t.022 = phi i32 [ -1, %for.body.preheader ], [ %spec.select, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %data0, i64 %iv
+  %1 = load i32, ptr %arrayidx, align 4
+  %2 = sext i32 %1 to i64
+  %cmp1 = icmp slt i64 %iv, %2
+  %spec.select = select i1 %cmp1, i32 %1, i32 %t.022
+  %arrayidx5 = getelementptr inbounds i32, ptr %data1, i64 %iv
+  %3 = load i32, ptr %arrayidx5, align 4
+  %4 = sext i32 %3 to i64
+  %cmp6 = icmp slt i64 %iv, %4
+  %s.1 = select i1 %cmp6, i32 %3, i32 %s.023
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %exit.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; float csa_in_series_float_select(int N, float *data0,
+;                           float *data1) {
+;   float t = 1.0f;
+;   float s = 1.0f;
+;   for (int i = 0; i < N; i++) {
+;     if (0.0f < data0[i])
+;       t = data0[i];
+;     if (0.0f <data1[i])
+;       s = data1[i];
+;   }
+;   return t + s; // use t and s
+; }
+define float @csa_in_series_float_select(i32 %N, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_in_series_float_select(
+; EVL-NEXT:  entry:
+; EVL-NEXT:    [[CMP19:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT:    br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
+; EVL:       for.body.preheader:
+; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; EVL:       vector.ph:
+; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP2]]
+; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; EVL:       vector.body:
+; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL7:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL8:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_VL_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[CSA_VL_SEL6:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_VL_PHI3:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[CSA_VL_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    [[AVL:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[EVL_BASED_IV]]
+; EVL-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; EVL-NEXT:    [[TMP6:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[TMP6]]
+; EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
+; EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP8]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP5]])
+; EVL-NEXT:    [[TMP9:%.*]] = fcmp ogt <vscale x 4 x float> [[VP_OP_LOAD]], zeroinitializer
+; EVL-NEXT:    [[CSA_COND_ANYACTIVE:%.*]] = call i1 @llvm.vp.reduce.or.nxv4i1(i1 false, <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP5]])
+; EVL-NEXT:    [[CSA_VL_SEL]] = select i1 [[CSA_COND_ANYACTIVE]], i32 [[TMP5]], i32 [[CSA_VL_PHI3]]
+; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[CSA_COND_ANYACTIVE]], <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> [[CSA_MASK_PHI1]]
+; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[CSA_COND_ANYACTIVE]], <vscale x 4 x float> [[VP_OP_LOAD]], <vscale x 4 x float> [[CSA_DATA_PHI2]]
+; EVL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[TMP6]]
+; EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 0
+; EVL-NEXT:    [[VP_OP_LOAD4:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP11]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP5]])
+; EVL-NEXT:    [[TMP12:%.*]] = fcmp ogt <vscale x 4 x float> [[VP_OP_LOAD4]], zeroinitializer
+; EVL-NEXT:    [[CSA_COND_ANYACTIVE5:%.*]] = call i1 @llvm.vp.reduce.or.nxv4i1(i1 false, <vscale x 4 x i1> [[TMP12]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP5]])
+; EVL-NEXT:    [[CSA_VL_SEL6]] = select i1 [[CSA_COND_ANYACTIVE5]], i32 [[TMP5]], i32 [[CSA_VL_PHI]]
+; EVL-NEXT:    [[CSA_MASK_SEL7]] = select i1 [[CSA_COND_ANYACTIVE5]], <vscale x 4 x i1> [[TMP12]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT:    [[CSA_DATA_SEL8]] = select i1 [[CSA_COND_ANYACTIVE5]], <vscale x 4 x float> [[VP_OP_LOAD4]], <vscale x 4 x float> [[CSA_DATA_PHI]]
+; EVL-NEXT:    [[TMP13:%.*]] = zext i32 [[TMP5]] to i64
+; EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP13]], [[EVL_BASED_IV]]
+; EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; EVL-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; EVL:       middle.block:
+; EVL-NEXT:    [[CSA_STEP9:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; EVL-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vp.reduce.smax.nxv4i32(i32 -1, <vscale x 4 x i32> [[CSA_STEP9]], <vscale x 4 x i1> [[CSA_MASK_SEL7]], i32 [[CSA_VL_SEL6]])
+; EVL-NEXT:    [[CSA_EXTRACT10:%.*]] = extractelement <vscale x 4 x float> [[CSA_DATA_SEL8]], i32 [[TMP15]]
+; EVL-NEXT:    [[TMP16:%.*]] = icmp sge i32 [[TMP15]], 0
+; EVL-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], float [[CSA_EXTRACT10]], float 1.000000e+00
+; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; EVL-NEXT:    [[TMP18:%.*]] = call i32 @llvm.vp.reduce.smax.nxv4i32(i32 -1, <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i1> [[CSA_MASK_SEL]], i32 [[CSA_VL_SEL]])
+; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x float> [[CSA_DATA_SEL]], i32 [[TMP18]]
+; EVL-NEXT:    [[TMP19:%.*]] = icmp sge i32 [[TMP18]], 0
+; EVL-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], float [[CSA_EXTRACT]], float 1.000000e+00
+; EVL-NEXT:    br i1 true, label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL:       scalar.ph:
+; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; EVL:       exit.loopexit:
+; EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT:    [[TMP21:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
+; EVL-NEXT:    br label [[EXIT]]
+; EVL:       exit:
+; EVL-NEXT:    [[ADD:%.*]] = phi float [ [[TMP21]], [[EXIT_LOOPEXIT]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
+; EVL-NEXT:    ret float [[ADD]]
+; EVL:       for.body:
+; EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[S_021:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[T_020:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0]], i64 [[IV]]
+; EVL-NEXT:    [[TMP22:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP22]], 0.000000e+00
+; EVL-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP22]], float [[T_020]]
+; EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i64 [[IV]]
+; EVL-NEXT:    [[TMP23:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
+; EVL-NEXT:    [[CMP6:%.*]] = fcmp ogt float [[TMP23]], 0.000000e+00
+; EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], float [[TMP23]], float [[S_021]]
+; EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+;
+; NO-EVL-LABEL: @csa_in_series_float_select(
+; NO-EVL-NEXT:  entry:
+; NO-EVL-NEXT:    [[CMP19:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT:    br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
+; NO-EVL:       for.body.preheader:
+; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-EVL:       vector.ph:
+; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; NO-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NO-EVL:       vector.body:
+; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[TMP6]]
+; NO-EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
+; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
+; NO-EVL-NEXT:    [[TMP9:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD]], zeroinitializer
+; NO-EVL-NEXT:    [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP9]])
+; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP10]], <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> [[CSA_MASK_PHI1]]
+; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP10]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[CSA_DATA_PHI2]]
+; NO-EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[TMP6]]
+; NO-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0
+; NO-EVL-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, ptr [[TMP12]], align 4
+; NO-EVL-NEXT:    [[TMP13:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD3]], zeroinitializer
+; NO-EVL-NEXT:    [[TMP14:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP13]])
+; NO-EVL-NEXT:    [[CSA_MASK_SEL4]] = select i1 [[TMP14]], <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT:    [[CSA_DATA_SEL5]] = select i1 [[TMP14]], <vscale x 4 x float> [[WIDE_LOAD3]], <vscale x 4 x float> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-EVL-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; NO-EVL:       middle.block:
+; NO-EVL-NEXT:    [[CSA_STEP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; NO-EVL-NEXT:    [[TMP16:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL4]], <vscale x 4 x i32> [[CSA_STEP6]], <vscale x 4 x i32> zeroinitializer
+; NO-EVL-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP16]])
+; NO-EVL-NEXT:    [[TMP18:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL4]], i64 0
+; NO-EVL-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[TMP17]], 0
+; NO-EVL-NEXT:    [[TMP20:%.*]] = and i1 [[TMP18]], [[TMP19]]
+; NO-EVL-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], i32 0, i32 -1
+; NO-EVL-NEXT:    [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 4 x float> [[CSA_DATA_SEL5]], i32 [[TMP21]]
+; NO-EVL-NEXT:    [[TMP22:%.*]] = icmp sge i32 [[TMP21]], 0
+; NO-EVL-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], float [[CSA_EXTRACT7]], float 1.000000e+00
+; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; NO-EVL-NEXT:    [[TMP24:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; NO-EVL-NEXT:    [[TMP25:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP24]])
+; NO-EVL-NEXT:    [[TMP26:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT:    [[TMP27:%.*]] = icmp eq i32 [[TMP25]], 0
+; NO-EVL-NEXT:    [[TMP28:%.*]] = and i1 [[TMP26]], [[TMP27]]
+; NO-EVL-NEXT:    [[TMP29:%.*]] = select i1 [[TMP28]], i32 0, i32 -1
+; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x float> [[CSA_DATA_SEL]], i32 [[TMP29]]
+; NO-EVL-NEXT:    [[TMP30:%.*]] = icmp sge i32 [[TMP29]], 0
+; NO-EVL-NEXT:    [[TMP31:%.*]] = select i1 [[TMP30]], float [[CSA_EXTRACT]], float 1.000000e+00
+; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL:       scalar.ph:
+; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-EVL:       exit.loopexit:
+; NO-EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT:    [[TMP32:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
+; NO-EVL-NEXT:    br label [[EXIT]]
+; NO-EVL:       exit:
+; NO-EVL-NEXT:    [[ADD:%.*]] = phi float [ [[TMP32]], [[EXIT_LOOPEXIT]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
+; NO-EVL-NEXT:    ret float [[ADD]]
+; NO-EVL:       for.body:
+; NO-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[S_021:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[T_020:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0]], i64 [[IV]]
+; NO-EVL-NEXT:    [[TMP33:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP33]], 0.000000e+00
+; NO-EVL-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP33]], float [[T_020]]
+; NO-EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i64 [[IV]]
+; NO-EVL-NEXT:    [[TMP34:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
+; NO-EVL-NEXT:    [[CMP6:%.*]] = fcmp ogt float [[TMP34]], 0.000000e+00
+; NO-EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], float [[TMP34]], float [[S_021]]
+; NO-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+;
+entry:
+  %cmp19 = icmp sgt i32 %N, 0
+  br i1 %cmp19, label %for.body.preheader, label %exit
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+exit.loopexit:                        ; preds = %for.body
+  %0 = fadd float %t.1, %s.1
+  br label %exit
+
+exit:                                 ; preds = %exit.loopexit, %entry
+  %add = phi float [ %0, %exit.loopexit ], [ 2.000000e+00, %entry ]
+  ret float %add
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
+  %s.021 = phi float [ 1.000000e+00, %for.body.preheader ], [ %s.1, %for.body ]
+  %t.020 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.body ]
+  %arrayidx = getelementptr inbounds float, ptr %data0, i64 %iv
+  %1 = load float, ptr %arrayidx, align 4
+  %cmp1 = fcmp ogt float %1, 0.000000e+00
+  %t.1 = select i1 %cmp1, float %1, float %t.020
+  %arrayidx5 = getelementptr inbounds float, ptr %data1, i64 %iv
+  %2 = load float, ptr %arrayidx5, align 4
+  %cmp6 = fcmp ogt float %2, 0.000000e+00
+  %s.1 = select i1 %cmp6, float %2, float %s.021
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %exit.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int csa_in_series_int(int N, bool *cond0, bool *cond1, int *data0, int *data1) {
+;   int t = -1;
+;   int s = -1;
+;   for (int i = 0; i < N; i++) {
+;     if (cond0[i])
+;       t = data0[i];
+;     if (cond1[i])
+;       s = data1[i];
+;   }
+;   return t | s; // use t and s
+; }
+define i32 @csa_in_series_int(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
+; CHECK-LABEL: @csa_in_series_int(
+; CHECK-NOT: vector.body:
+;
+entry:
+  %cmp15 = icmp sgt i32 %N, 0
+  br i1 %cmp15, label %for.body.preheader, label %exit
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+exit.loopexit:                        ; preds = %for.inc
+  %0 = or i32 %s.1, %t.1
+  br label %exit
+
+exit:                                 ; preds = %exit.loopexit, %entry
+  %or = phi i32 [ %0, %exit.loopexit ], [ -1, %entry ]
+  ret i32 %or
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
+  %s.017 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.inc ]
+  %t.016 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
+  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %iv
+  %1 = load i8, ptr %arrayidx, align 1
+  %tobool.not = icmp eq i8 %1, 0
+  br i1 %tobool.not, label %if.end, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %iv
+  %2 = load i32, ptr %arrayidx2, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  %t.1 = phi i32 [ %2, %if.then ], [ %t.016, %for.body ]
+  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %iv
+  %3 = load i8, ptr %arrayidx4, align 1
+  %tobool5.not = icmp eq i8 %3, 0
+  br i1 %tobool5.not, label %for.inc, label %if.then6
+
+if.then6:                                         ; preds = %if.end
+  %arrayidx8 = getelementptr inbounds i32, ptr %data1, i64 %iv
+  %4 = load i32, ptr %arrayidx8, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end, %if.then6
+  %s.1 = phi i32 [ %4, %if.then6 ], [ %s.017, %if.end ]
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %exit.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; float csa_in_series_float(int N, bool *cond0, bool *cond1, float *data0,
+;                           float *data1) {
+;   float t = 1.0f;
+;   float s = 1.0f;
+;   for (int i = 0; i < N; i++) {
+;     if (cond0[i])
+;       t = data0[i];
+;     if (cond1[i])
+;       s = data1[i];
+;   }
+;   return t + s; // use t and s
+; }
+define float @csa_in_series_float(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
+; CHECK-LABEL: @csa_in_series_float(
+; CHECK-NOT: vector.body:
+;
+entry:
+  %cmp15 = icmp sgt i32 %N, 0
+  br i1 %cmp15, label %for.body.preheader, label %exit
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+exit.loopexit:                        ; preds = %for.inc
+  %0 = fadd float %t.1, %s.1
+  br label %exit
+
+exit:                                 ; preds = %exit.loopexit, %entry
+  %add = phi float [ %0, %exit.loopexit ], [ 2.000000e+00, %entry ]
+  ret float %add
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
+  %s.017 = phi float [ 1.000000e+00, %for.body.preheader ], [ %s.1, %for.inc ]
+  %t.016 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
+  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %iv
+  %1 = load i8, ptr %arrayidx, align 1
+  %tobool.not = icmp eq i8 %1, 0
+  br i1 %tobool.not, label %if.end, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %iv
+  %2 = load float, ptr %arrayidx2, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  %t.1 = phi float [ %2, %if.then ], [ %t.016, %for.body ]
+  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %iv
+  %3 = load i8, ptr %arrayidx4, align 1
+  %tobool5.not = icmp eq i8 %3, 0
+  br i1 %tobool5.not, label %for.inc, label %if.then6
+
+if.then6:                                         ; preds = %if.end
+  %arrayidx8 = getelementptr inbounds float, ptr %data1, i64 %iv
+  %4 = load float, ptr %arrayidx8, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end, %if.then6
+  %s.1 = phi float [ %4, %if.then6 ], [ %s.017, %if.end ]
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %exit.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int csa_in_series_same_scalar_int_select(int N, int *data0,
+;                                   int *data1) {
+;   int t = -1;
+;   for (int i = 0; i < N; i++) {
+;     if (i < data0[i])
+;       t = data0[i];
+;     if (i < data1[i])
+;       t = data1[i];
+;   }
+;   return t; // use t
+; }
+define i32 @csa_in_series_same_scalar_int_select(i32 %N, ptr %data0, ptr %data1) {
+; CHECK-LABEL: @csa_in_series_same_scalar_int_selectcsa_in_series_float(
+; CHECK-NOT: vector.body:
+;
+entry:
+  %cmp21 = icmp sgt i32 %N, 0
+  br i1 %cmp21, label %for.body.preheader, label %exit
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+exit:                                 ; preds = %for.body, %entry
+  %t.0.lcssa = phi i32 [ -1, %entry ], [ %t.2, %for.body ]
+  ret i32 %t.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
+  %t.022 = phi i32 [ -1, %for.body.preheader ], [ %t.2, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %data0, i64 %iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %1 = sext i32 %0 to i64
+  %cmp1 = icmp slt i64 %iv, %1
+  %spec.select = select i1 %cmp1, i32 %0, i32 %t.022
+  %arrayidx5 = getelementptr inbounds i32, ptr %data1, i64 %iv
+  %2 = load i32, ptr %arrayidx5, align 4
+  %3 = sext i32 %2 to i64
+  %cmp6 = icmp slt i64 %iv, %3
+  %t.2 = select i1 %cmp6, i32 %2, i32 %spec.select
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %exit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; float csa_in_series_same_scalar_float_select(int N,
+;                                       float *data0, float *data1) {
+;   float t = 1.0f;
+;   for (int i = 0; i < N; i++) {
+;     if (0.0f < data0[i])
+;       t = data0[i];
+;     if (0.0f < data1[i])
+;       t = data1[i];
+;   }
+;   return t; // use t
+; }
+define float @csa_in_series_same_scalar_float_select(i32 %N, ptr %data0, ptr %data1) {
+; CHECK-LABEL: @csa_in_series_same_scalar_float_select(
+; CHECK-NOT: vector.body:
+;
+entry:
+  %cmp19 = icmp sgt i32 %N, 0
+  br i1 %cmp19, label %for.body.preheader, label %exit
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+exit:                                 ; preds = %for.body, %entry
+  %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.2, %for.body ]
+  ret float %t.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
+  %t.020 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.2, %for.body ]
+  %arrayidx = getelementptr inbounds float, ptr %data0, i64 %iv
+  %0 = load float, ptr %arrayidx, align 4
+  %cmp1 = fcmp ogt float %0, 0.000000e+00
+  %t.1 = select i1 %cmp1, float %0, float %t.020
+  %arrayidx5 = getelementptr inbounds float, ptr %data1, i64 %iv
+  %1 = load float, ptr %arrayidx5, align 4
+  %cmp6 = fcmp ogt float %1, 0.000000e+00
+  %t.2 = select i1 %cmp6, float %1, float %t.1
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %exit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int csa_in_series_same_scalar_int(int N, bool *cond0, bool *cond1, int *data0,
+;                                   int *data1) {
+;   int t = -1;
+;   for (int i = 0; i < N; i++) {
+;     if (cond0[i])
+;       t = data0[i];
+;     if (cond1[i])
+;       t = data1[i];
+;   }
+;   return t; // use t
+; }
+define i32 @csa_in_series_same_scalar_int(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
+; CHECK-LABEL: @csa_in_series_same_scalar_int(
+; CHECK-NOT: vector.body:
+;
+entry:
+  %cmp15 = icmp sgt i32 %N, 0
+  br i1 %cmp15, label %for.body.preheader, label %exit
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+exit:                                 ; preds = %for.inc, %entry
+  %t.0.lcssa = phi i32 [ -1, %entry ], [ %t.2, %for.inc ]
+  ret i32 %t.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
+  %t.016 = phi i32 [ -1, %for.body.preheader ], [ %t.2, %for.inc ]
+  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %tobool.not = icmp eq i8 %0, 0
+  br i1 %tobool.not, label %if.end, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %iv
+  %1 = load i32, ptr %arrayidx2, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  %t.1 = phi i32 [ %1, %if.then ], [ %t.016, %for.body ]
+  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %iv
+  %2 = load i8, ptr %arrayidx4, align 1
+  %tobool5.not = icmp eq i8 %2, 0
+  br i1 %tobool5.not, label %for.inc, label %if.then6
+
+if.then6:                                         ; preds = %if.end
+  %arrayidx8 = getelementptr inbounds i32, ptr %data1, i64 %iv
+  %3 = load i32, ptr %arrayidx8, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end, %if.then6
+  %t.2 = phi i32 [ %3, %if.then6 ], [ %t.1, %if.end ]
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %exit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; float csa_in_series_same_scalar_float(int N, bool *cond0, bool *cond1,
+;                                       float *data0, float *data1) {
+;   float t = 1.0f;
+;   for (int i = 0; i < N; i++) {
+;     if (cond0[i])
+;       t = data0[i];
+;     if (cond1[i])
+;       t = data1[i];
+;   }
+;   return t; // use t
+; }
+define float @csa_in_series_same_scalar_float(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
+; CHECK-LABEL: @csa_in_series_same_scalar_float(
+; CHECK-NOT: vector.body:
+;
+entry:
+  %cmp15 = icmp sgt i32 %N, 0
+  br i1 %cmp15, label %for.body.preheader, label %exit
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+exit:                                 ; preds = %for.inc, %entry
+  %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.2, %for.inc ]
+  ret float %t.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
+  %t.016 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.2, %for.inc ]
+  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %tobool.not = icmp eq i8 %0, 0
+  br i1 %tobool.not, label %if.end, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %iv
+  %1 = load float, ptr %arrayidx2, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  %t.1 = phi float [ %1, %if.then ], [ %t.016, %for.body ]
+  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %iv
+  %2 = load i8, ptr %arrayidx4, align 1
+  %tobool5.not = icmp eq i8 %2, 0
+  br i1 %tobool5.not, label %for.inc, label %if.then6
+
+if.then6:                                         ; preds = %if.end
+  %arrayidx8 = getelementptr inbounds float, ptr %data1, i64 %iv
+  %3 = load float, ptr %arrayidx8, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end, %if.then6
+  %t.2 = phi float [ %3, %if.then6 ], [ %t.1, %if.end ]
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %exit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int csa_same_cond_int(int N, bool *cond, int *data0, int *data1) {
+;   int t = -1;
+;   int s = -1;
+;   for (int i = 0; i < N; i++) {
+;     if (cond[i]) {
+;       t = data0[i];
+;       s = data1[i];
+;     }
+;   }
+;   return t | s; // use t and s
+; }
+define i32 @csa_same_cond_int(i32 %N, ptr %cond, ptr %data0, ptr %data1) {
+; CHECK-LABEL: @csa_same_cond_int(
+; CHECK-NOT: vector.body:
+;
+entry:
+  %cmp9 = icmp sgt i32 %N, 0
+  br i1 %cmp9, label %for.body.preheader, label %exit
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+exit.loopexit:                        ; preds = %for.inc
+  %0 = or i32 %s.1, %t.1
+  br label %exit
+
+exit:                                 ; preds = %exit.loopexit, %entry
+  %or = phi i32 [ %0, %exit.loopexit ], [ -1, %entry ]
+  ret i32 %or
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
+  %s.011 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.inc ]
+  %t.010 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
+  %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %iv
+  %1 = load i8, ptr %arrayidx, align 1
+  %tobool.not = icmp eq i8 %1, 0
+  br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %iv
+  %2 = load i32, ptr %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds i32, ptr %data1, i64 %iv
+  %3 = load i32, ptr %arrayidx4, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %t.1 = phi i32 [ %2, %if.then ], [ %t.010, %for.body ]
+  %s.1 = phi i32 [ %3, %if.then ], [ %s.011, %for.body ]
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %exit.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; float csa_same_cond_float(int N, bool *cond, float *data0, float *data1) {
+;   float t = 1.0f;
+;   float s = 1.0f;
+;   for (int i = 0; i < N; i++) {
+;     if (cond[i]) {
+;       t = data0[i];
+;       s = data1[i];
+;     }
+;   }
+;   return t + s; // use t and s
+; }
+define float @csa_same_cond_float(i32 %N, ptr %cond, ptr %data0, ptr %data1) {
+; CHECK-LABEL: @csa_same_cond_float(
+; CHECK-NOT: vector.body:
+;
+entry:
+  %cmp9 = icmp sgt i32 %N, 0
+  br i1 %cmp9, label %for.body.preheader, label %exit
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+exit.loopexit:                        ; preds = %for.inc
+  %0 = fadd float %t.1, %s.1
+  br label %exit
+
+exit:                                 ; preds = %exit.loopexit, %entry
+  %add = phi float [ %0, %exit.loopexit ], [ 2.000000e+00, %entry ]
+  ret float %add
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
+  %s.011 = phi float [ 1.000000e+00, %for.body.preheader ], [ %s.1, %for.inc ]
+  %t.010 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
+  %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %iv
+  %1 = load i8, ptr %arrayidx, align 1
+  %tobool.not = icmp eq i8 %1, 0
+  br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %iv
+  %2 = load float, ptr %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds float, ptr %data1, i64 %iv
+  %3 = load float, ptr %arrayidx4, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %t.1 = phi float [ %2, %if.then ], [ %t.010, %for.body ]
+  %s.1 = phi float [ %3, %if.then ], [ %s.011, %for.body ]
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %exit.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int csa_else_if_same_scalar_int(int N, bool *cond0, bool *cond1, int *data0,
+;                                 int *data1) {
+;   int t = -1;
+;   for (int i = 0; i < N; i++) {
+;     if (cond0[i])
+;       t = data0[i];
+;     else if (cond1[i])
+;       t = data1[i];
+;   }
+;   return t; // use t
+; }
+define i32 @csa_else_if_same_scalar_int(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
+; CHECK-LABEL: @csa_else_if_same_scalar_int(
+; CHECK-NOT: vector.body:
+;
+entry:
+  %cmp15 = icmp sgt i32 %N, 0
+  br i1 %cmp15, label %for.body.preheader, label %exit
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+exit:                                 ; preds = %for.inc, %entry
+  %t.0.lcssa = phi i32 [ -1, %entry ], [ %t.1, %for.inc ]
+  ret i32 %t.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
+  %t.016 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
+  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %tobool.not = icmp eq i8 %0, 0
+  br i1 %tobool.not, label %if.else, label %for.inc.sink.split
+
+if.else:                                          ; preds = %for.body
+  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %iv
+  %1 = load i8, ptr %arrayidx4, align 1
+  %tobool5.not = icmp eq i8 %1, 0
+  br i1 %tobool5.not, label %for.inc, label %for.inc.sink.split
+
+for.inc.sink.split:                               ; preds = %if.else, %for.body
+  %data0.sink = phi ptr [ %data0, %for.body ], [ %data1, %if.else ]
+  %arrayidx2 = getelementptr inbounds i32, ptr %data0.sink, i64 %iv
+  %2 = load i32, ptr %arrayidx2, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.inc.sink.split, %if.else
+  %t.1 = phi i32 [ %t.016, %if.else ], [ %2, %for.inc.sink.split ]
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %exit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; float csa_else_if_same_scalar_float(int N, bool *cond0, bool *cond1,
+;                                     float *data0, float *data1) {
+;   float t = 1.0f;
+;   for (int i = 0; i < N; i++) {
+;     if (cond0[i])
+;       t = data0[i];
+;     else if (cond1[i])
+;       t = data1[i];
+;   }
+;   return t; // use t
+; }
+define float @csa_else_if_same_scalar_float(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
+; CHECK-LABEL: @csa_else_if_same_scalar_float(
+; CHECK-NOT: vector.body:
+;
+entry:
+  %cmp15 = icmp sgt i32 %N, 0
+  br i1 %cmp15, label %for.body.preheader, label %exit
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+exit:                                 ; preds = %for.inc, %entry
+  %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.1, %for.inc ]
+  ret float %t.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
+  %t.016 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
+  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %tobool.not = icmp eq i8 %0, 0
+  br i1 %tobool.not, label %if.else, label %for.inc.sink.split
+
+if.else:                                          ; preds = %for.body
+  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %iv
+  %1 = load i8, ptr %arrayidx4, align 1
+  %tobool5.not = icmp eq i8 %1, 0
+  br i1 %tobool5.not, label %for.inc, label %for.inc.sink.split
+
+for.inc.sink.split:                               ; preds = %if.else, %for.body
+  %data0.sink = phi ptr [ %data0, %for.body ], [ %data1, %if.else ]
+  %arrayidx2 = getelementptr inbounds float, ptr %data0.sink, i64 %iv
+  %2 = load float, ptr %arrayidx2, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.inc.sink.split, %if.else
+  %t.1 = phi float [ %t.016, %if.else ], [ %2, %for.inc.sink.split ]
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %exit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int csa_else_if_int(int N, bool *cond0, bool *cond1, int *data0, int *data1) {
+;   int t = -1;
+;   int s = -1;
+;   for (int i = 0; i < N; i++) {
+;     if (cond0[i])
+;       t = data0[i];
+;     else if (cond1[i])
+;       s = data1[i];
+;   }
+;   return t | s; // use t and s
+; }
+define i32 @csa_else_if_int(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
+; CHECK-LABEL: @csa_else_if_int(
+; CHECK-NOT: vector.body:
+;
+entry:
+  %cmp15 = icmp sgt i32 %N, 0
+  br i1 %cmp15, label %for.body.preheader, label %exit
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+exit.loopexit:                        ; preds = %for.inc
+  %0 = or i32 %s.1, %t.1
+  br label %exit
+
+exit:                                 ; preds = %exit.loopexit, %entry
+  %or = phi i32 [ %0, %exit.loopexit ], [ -1, %entry ]
+  ret i32 %or
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
+  %s.017 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.inc ]
+  %t.016 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
+  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %iv
+  %1 = load i8, ptr %arrayidx, align 1
+  %tobool.not = icmp eq i8 %1, 0
+  br i1 %tobool.not, label %if.else, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %iv
+  %2 = load i32, ptr %arrayidx2, align 4
+  br label %for.inc
+
+if.else:                                          ; preds = %for.body
+  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %iv
+  %3 = load i8, ptr %arrayidx4, align 1
+  %tobool5.not = icmp eq i8 %3, 0
+  br i1 %tobool5.not, label %for.inc, label %if.then6
+
+if.then6:                                         ; preds = %if.else
+  %arrayidx8 = getelementptr inbounds i32, ptr %data1, i64 %iv
+  %4 = load i32, ptr %arrayidx8, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.then, %if.then6, %if.else
+  %t.1 = phi i32 [ %2, %if.then ], [ %t.016, %if.then6 ], [ %t.016, %if.else ]
+  %s.1 = phi i32 [ %s.017, %if.then ], [ %4, %if.then6 ], [ %s.017, %if.else ]
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %exit.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; float csa_else_if_float(int N, bool *cond0, bool *cond1, float *data0,
+;                         float *data1) {
+;   float t = 1.0f;
+;   float s = 1.0f;
+;   for (int i = 0; i < N; i++) {
+;     if (cond0[i])
+;       t = data0[i];
+;     else if (cond1[i])
+;       s = data1[i];
+;   }
+;   return t + s; // use t and s
+; }
+define float @csa_else_if_float(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
+; CHECK-LABEL: @csa_else_if_float(
+; CHECK-NOT: vector.body:
+;
+entry:
+  %cmp15 = icmp sgt i32 %N, 0
+  br i1 %cmp15, label %for.body.preheader, label %exit
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+exit.loopexit:                        ; preds = %for.inc
+  %0 = fadd float %t.1, %s.1
+  br label %exit
+
+exit:                                 ; preds = %exit.loopexit, %entry
+  %add = phi float [ %0, %exit.loopexit ], [ 2.000000e+00, %entry ]
+  ret float %add
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
+  %s.017 = phi float [ 1.000000e+00, %for.body.preheader ], [ %s.1, %for.inc ]
+  %t.016 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
+  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %iv
+  %1 = load i8, ptr %arrayidx, align 1
+  %tobool.not = icmp eq i8 %1, 0
+  br i1 %tobool.not, label %if.else, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %iv
+  %2 = load float, ptr %arrayidx2, align 4
+  br label %for.inc
+
+if.else:                                          ; preds = %for.body
+  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %iv
+  %3 = load i8, ptr %arrayidx4, align 1
+  %tobool5.not = icmp eq i8 %3, 0
+  br i1 %tobool5.not, label %for.inc, label %if.then6
+
+if.then6:                                         ; preds = %if.else
+  %arrayidx8 = getelementptr inbounds float, ptr %data1, i64 %iv
+  %4 = load float, ptr %arrayidx8, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.then, %if.then6, %if.else
+  %t.1 = phi float [ %2, %if.then ], [ %t.016, %if.then6 ], [ %t.016, %if.else ]
+  %s.1 = phi float [ %s.017, %if.then ], [ %4, %if.then6 ], [ %s.017, %if.else ]
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %exit.loopexit, label %for.body
+}
 
 ; This function is generated from the following C/C++ program:
 ; uint64_t idx_scalar(int64_t *a, int64_t *b, uint64_t ii, uint64_t n) {
@@ -15,83 +1881,7 @@
 ; }
 define i64 @idx_scalar(ptr %a, ptr %b, i64 %ii, i64 %n) {
 ; EVL-LABEL: @idx_scalar(
-; EVL-NEXT:  entry:
-; EVL-NEXT:    [[CMP8_NOT:%.*]] = icmp eq i64 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP8_NOT]], label [[EXIT:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
-; EVL:       for.body.preheader:
-; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
-; EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
-; EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; EVL:       vector.ph:
-; EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
-; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
-; EVL-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
-; EVL-NEXT:    [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
-; EVL-NEXT:    [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
-; EVL-NEXT:    [[TMP9:%.*]] = mul i64 1, [[TMP5]]
-; EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP9]], i64 0
-; EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; EVL:       vector.body:
-; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 0
-; EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP10]]
-; EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP11]], i32 0
-; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP12]], align 8
-; EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP10]]
-; EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i32 0
-; EVL-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i64>, ptr [[TMP14]], align 8
-; EVL-NEXT:    [[TMP15:%.*]] = icmp sgt <vscale x 2 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; EVL-NEXT:    [[TMP16:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP15]])
-; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP16]], <vscale x 2 x i1> [[TMP15]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
-; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP16]], <vscale x 2 x i64> [[VEC_IND]], <vscale x 2 x i64> [[CSA_DATA_PHI]]
-; EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; EVL-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; EVL:       middle.block:
-; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; EVL-NEXT:    [[TMP18:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
-; EVL-NEXT:    [[TMP19:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP18]])
-; EVL-NEXT:    [[TMP20:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
-; EVL-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP19]], 0
-; EVL-NEXT:    [[TMP22:%.*]] = and i1 [[TMP20]], [[TMP21]]
-; EVL-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], i32 0, i32 -1
-; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x i64> [[CSA_DATA_SEL]], i32 [[TMP23]]
-; EVL-NEXT:    [[TMP24:%.*]] = icmp sge i32 [[TMP23]], 0
-; EVL-NEXT:    [[TMP25:%.*]] = select i1 [[TMP24]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
-; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; EVL:       scalar.ph:
-; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       exit.loopexit:
-; EVL-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT:    br label [[EXIT]]
-; EVL:       exit:
-; EVL-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[EXIT_LOOPEXIT]] ]
-; EVL-NEXT:    ret i64 [[IDX_0_LCSSA]]
-; EVL:       for.body:
-; EVL-NEXT:    [[I_010:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; EVL-NEXT:    [[IDX_09:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[I_010]]
-; EVL-NEXT:    [[TMP26:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; EVL-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[I_010]]
-; EVL-NEXT:    [[TMP27:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
-; EVL-NEXT:    [[CMP2:%.*]] = icmp sgt i64 [[TMP26]], [[TMP27]]
-; EVL-NEXT:    [[COND]] = select i1 [[CMP2]], i64 [[I_010]], i64 [[IDX_09]]
-; EVL-NEXT:    [[INC]] = add nuw i64 [[I_010]], 1
-; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; EVL-NOT: vector.body:
 ;
 ; NO-EVL-LABEL: @idx_scalar(
 ; NO-EVL-NEXT:  entry:
@@ -136,7 +1926,7 @@ define i64 @idx_scalar(ptr %a, ptr %b, i64 %ii, i64 %n) {
 ; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; NO-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; NO-EVL-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-EVL-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; NO-EVL:       middle.block:
 ; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
 ; NO-EVL-NEXT:    [[TMP18:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
@@ -170,7 +1960,7 @@ define i64 @idx_scalar(ptr %a, ptr %b, i64 %ii, i64 %n) {
 ; NO-EVL-NEXT:    [[COND]] = select i1 [[CMP2]], i64 [[I_010]], i64 [[IDX_09]]
 ; NO-EVL-NEXT:    [[INC]] = add nuw i64 [[I_010]], 1
 ; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ;
 entry:
   %cmp8.not = icmp eq i64 %n, 0
@@ -205,153 +1995,8 @@ for.body:                                         ; preds = %for.body.preheader,
 ;   return idx;
 ; }
 define dso_local i64 @idx_scalar_dec(ptr %a, ptr %b, i64 %ii, i64 %n) {
-; EVL-LABEL: @idx_scalar_dec(
-; EVL-NEXT:  entry:
-; EVL-NEXT:    [[CMP_NOT9:%.*]] = icmp eq i64 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP_NOT9]], label [[EXIT:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
-; EVL:       for.body.preheader:
-; EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
-; EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; EVL:       vector.ph:
-; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; EVL-NEXT:    [[IND_END:%.*]] = sub i64 [[N]], [[N_VEC]]
-; EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[N]], i64 0
-; EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
-; EVL-NEXT:    [[INDUCTION:%.*]] = add <8 x i64> [[DOTSPLAT]], <i64 0, i64 -1, i64 -2, i64 -3, i64 -4, i64 -5, i64 -6, i64 -7>
-; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; EVL:       vector.body:
-; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <8 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[VEC_IND:%.*]] = phi <8 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <8 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[TMP0:%.*]] = add <8 x i64> [[VEC_IND]], <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
-; EVL-NEXT:    [[TMP1:%.*]] = extractelement <8 x i64> [[TMP0]], i32 0
-; EVL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP1]]
-; EVL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
-; EVL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 -7
-; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i64>, ptr [[TMP4]], align 8
-; EVL-NEXT:    [[REVERSE:%.*]] = shufflevector <8 x i64> [[WIDE_LOAD]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; EVL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP1]]
-; EVL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
-; EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -7
-; EVL-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i64>, ptr [[TMP7]], align 8
-; EVL-NEXT:    [[REVERSE2:%.*]] = shufflevector <8 x i64> [[WIDE_LOAD1]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; EVL-NEXT:    [[TMP8:%.*]] = icmp sgt <8 x i64> [[REVERSE]], [[REVERSE2]]
-; EVL-NEXT:    [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP8]])
-; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP9]], <8 x i1> [[TMP8]], <8 x i1> [[CSA_MASK_PHI]]
-; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP9]], <8 x i64> [[VEC_IND]], <8 x i64> [[CSA_DATA_PHI]]
-; EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; EVL-NEXT:    [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], <i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8>
-; EVL-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; EVL:       middle.block:
-; EVL-NEXT:    [[TMP11:%.*]] = select <8 x i1> [[CSA_MASK_SEL]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, <8 x i32> zeroinitializer
-; EVL-NEXT:    [[TMP12:%.*]] = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> [[TMP11]])
-; EVL-NEXT:    [[TMP13:%.*]] = extractelement <8 x i1> [[CSA_MASK_SEL]], i64 0
-; EVL-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], 0
-; EVL-NEXT:    [[TMP15:%.*]] = and i1 [[TMP13]], [[TMP14]]
-; EVL-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 0, i32 -1
-; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <8 x i64> [[CSA_DATA_SEL]], i32 [[TMP16]]
-; EVL-NEXT:    [[TMP17:%.*]] = icmp sge i32 [[TMP16]], 0
-; EVL-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
-; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; EVL:       scalar.ph:
-; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ]
-; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       exit.loopexit:
-; EVL-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT:    br label [[EXIT]]
-; EVL:       exit:
-; EVL-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[EXIT_LOOPEXIT]] ]
-; EVL-NEXT:    ret i64 [[IDX_0_LCSSA]]
-; EVL:       for.body:
-; EVL-NEXT:    [[I_011:%.*]] = phi i64 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; EVL-NEXT:    [[IDX_010:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
-; EVL-NEXT:    [[SUB]] = add i64 [[I_011]], -1
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[SUB]]
-; EVL-NEXT:    [[TMP19:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[SUB]]
-; EVL-NEXT:    [[TMP20:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
-; EVL-NEXT:    [[CMP3:%.*]] = icmp sgt i64 [[TMP19]], [[TMP20]]
-; EVL-NEXT:    [[COND]] = select i1 [[CMP3]], i64 [[I_011]], i64 [[IDX_010]]
-; EVL-NEXT:    [[CMP_NOT:%.*]] = icmp eq i64 [[SUB]], 0
-; EVL-NEXT:    br i1 [[CMP_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-;
-; NO-EVL-LABEL: @idx_scalar_dec(
-; NO-EVL-NEXT:  entry:
-; NO-EVL-NEXT:    [[CMP_NOT9:%.*]] = icmp eq i64 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP_NOT9]], label [[EXIT:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
-; NO-EVL:       for.body.preheader:
-; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
-; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; NO-EVL:       vector.ph:
-; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; NO-EVL-NEXT:    [[IND_END:%.*]] = sub i64 [[N]], [[N_VEC]]
-; NO-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[N]], i64 0
-; NO-EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
-; NO-EVL-NEXT:    [[INDUCTION:%.*]] = add <8 x i64> [[DOTSPLAT]], <i64 0, i64 -1, i64 -2, i64 -3, i64 -4, i64 -5, i64 -6, i64 -7>
-; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; NO-EVL:       vector.body:
-; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <8 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[VEC_IND:%.*]] = phi <8 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <8 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[TMP0:%.*]] = add <8 x i64> [[VEC_IND]], <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
-; NO-EVL-NEXT:    [[TMP1:%.*]] = extractelement <8 x i64> [[TMP0]], i32 0
-; NO-EVL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP1]]
-; NO-EVL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
-; NO-EVL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 -7
-; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i64>, ptr [[TMP4]], align 8
-; NO-EVL-NEXT:    [[REVERSE:%.*]] = shufflevector <8 x i64> [[WIDE_LOAD]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; NO-EVL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP1]]
-; NO-EVL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
-; NO-EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -7
-; NO-EVL-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i64>, ptr [[TMP7]], align 8
-; NO-EVL-NEXT:    [[REVERSE2:%.*]] = shufflevector <8 x i64> [[WIDE_LOAD1]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; NO-EVL-NEXT:    [[TMP8:%.*]] = icmp sgt <8 x i64> [[REVERSE]], [[REVERSE2]]
-; NO-EVL-NEXT:    [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP8]])
-; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP9]], <8 x i1> [[TMP8]], <8 x i1> [[CSA_MASK_PHI]]
-; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP9]], <8 x i64> [[VEC_IND]], <8 x i64> [[CSA_DATA_PHI]]
-; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; NO-EVL-NEXT:    [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], <i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8>
-; NO-EVL-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; NO-EVL:       middle.block:
-; NO-EVL-NEXT:    [[TMP11:%.*]] = select <8 x i1> [[CSA_MASK_SEL]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, <8 x i32> zeroinitializer
-; NO-EVL-NEXT:    [[TMP12:%.*]] = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> [[TMP11]])
-; NO-EVL-NEXT:    [[TMP13:%.*]] = extractelement <8 x i1> [[CSA_MASK_SEL]], i64 0
-; NO-EVL-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], 0
-; NO-EVL-NEXT:    [[TMP15:%.*]] = and i1 [[TMP13]], [[TMP14]]
-; NO-EVL-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 0, i32 -1
-; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <8 x i64> [[CSA_DATA_SEL]], i32 [[TMP16]]
-; NO-EVL-NEXT:    [[TMP17:%.*]] = icmp sge i32 [[TMP16]], 0
-; NO-EVL-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
-; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; NO-EVL:       scalar.ph:
-; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ]
-; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       exit.loopexit:
-; NO-EVL-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT:    br label [[EXIT]]
-; NO-EVL:       exit:
-; NO-EVL-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[EXIT_LOOPEXIT]] ]
-; NO-EVL-NEXT:    ret i64 [[IDX_0_LCSSA]]
-; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[I_011:%.*]] = phi i64 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; NO-EVL-NEXT:    [[IDX_010:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
-; NO-EVL-NEXT:    [[SUB]] = add i64 [[I_011]], -1
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[SUB]]
-; NO-EVL-NEXT:    [[TMP19:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; NO-EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[SUB]]
-; NO-EVL-NEXT:    [[TMP20:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
-; NO-EVL-NEXT:    [[CMP3:%.*]] = icmp sgt i64 [[TMP19]], [[TMP20]]
-; NO-EVL-NEXT:    [[COND]] = select i1 [[CMP3]], i64 [[I_011]], i64 [[IDX_010]]
-; NO-EVL-NEXT:    [[CMP_NOT:%.*]] = icmp eq i64 [[SUB]], 0
-; NO-EVL-NEXT:    br i1 [[CMP_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-LABEL: @idx_scalar_dec(
+; CHECK-NOT: vector.body:
 ;
 entry:
   %cmp.not9 = icmp eq i64 %n, 0
@@ -379,172 +2024,135 @@ for.body:                                         ; preds = %for.body.preheader,
 }
 
 ; This function is generated from the following C/C++ program:
-; int *simple_csa_ptr_select(int N, int **data) {
-;   int *t = nullptr;
+; The key part of this function is that the true arm of the select corresponds
+; to selecting the initial value, instead of selecting the new value.
+; int simple_csa_int_select_neg_cond(int N, int *data) {
+;   int t = 0;
 ;   for (int i = 0; i < N; i++) {
-;     if (a < *data[i])
+;     if (i != data[i])
 ;       t = data[i];
 ;   }
 ;   return t; // use t
 ; }
-define ptr @simple_csa_ptr_select(i32 %N, ptr %data) {
-; EVL-LABEL: @simple_csa_ptr_select(
-; EVL-NEXT:  entry:
-; EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
-; EVL:       for.body.preheader:
-; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
-; EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; EVL:       vector.ph:
-; EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
-; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
-; EVL-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
-; EVL-NEXT:    [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
-; EVL-NEXT:    [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
-; EVL-NEXT:    [[TMP9:%.*]] = mul i64 1, [[TMP5]]
-; EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP9]], i64 0
-; EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; EVL:       vector.body:
-; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x ptr> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 0
-; EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[TMP10]]
-; EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds ptr, ptr [[TMP11]], i32 0
-; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x ptr>, ptr [[TMP12]], align 8
-; EVL-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> [[WIDE_LOAD]], i32 4, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i32> poison)
-; EVL-NEXT:    [[TMP13:%.*]] = sext <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i64>
-; EVL-NEXT:    [[TMP14:%.*]] = icmp slt <vscale x 2 x i64> [[VEC_IND]], [[TMP13]]
-; EVL-NEXT:    [[TMP15:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP14]])
-; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP15]], <vscale x 2 x i1> [[TMP14]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
-; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP15]], <vscale x 2 x ptr> [[WIDE_LOAD]], <vscale x 2 x ptr> [[CSA_DATA_PHI]]
-; EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; EVL-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; EVL:       middle.block:
-; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; EVL-NEXT:    [[TMP17:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
-; EVL-NEXT:    [[TMP18:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP17]])
-; EVL-NEXT:    [[TMP19:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
-; EVL-NEXT:    [[TMP20:%.*]] = icmp eq i32 [[TMP18]], 0
-; EVL-NEXT:    [[TMP21:%.*]] = and i1 [[TMP19]], [[TMP20]]
-; EVL-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], i32 0, i32 -1
-; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x ptr> [[CSA_DATA_SEL]], i32 [[TMP22]]
-; EVL-NEXT:    [[TMP23:%.*]] = icmp sge i32 [[TMP22]], 0
-; EVL-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], ptr [[CSA_EXTRACT]], ptr null
-; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; EVL:       scalar.ph:
-; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       exit.loopexit:
-; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP24]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT:    br label [[EXIT]]
-; EVL:       exit:
-; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT]] ]
-; EVL-NEXT:    ret ptr [[T_0_LCSSA]]
-; EVL:       for.body:
-; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[T_010:%.*]] = phi ptr [ null, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA]], i64 [[INDVARS_IV]]
-; EVL-NEXT:    [[TMP25:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
-; EVL-NEXT:    [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4
-; EVL-NEXT:    [[TMP27:%.*]] = sext i32 [[TMP26]] to i64
-; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP27]]
-; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP25]], ptr [[T_010]]
-; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+define i32 @simple_csa_int_select_neg_cond(i32 %N, ptr %data) {
+; EVL-LABEL: @idx_scalar_dec(
+; EVL-NOT: vector.body:
 ;
-; NO-EVL-LABEL: @simple_csa_ptr_select(
+; NO-EVL-LABEL: @simple_csa_int_select_neg_cond(
 ; NO-EVL-NEXT:  entry:
 ; NO-EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
 ; NO-EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
 ; NO-EVL:       for.body.preheader:
 ; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
 ; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; NO-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
 ; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
 ; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; NO-EVL:       vector.ph:
 ; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; NO-EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
 ; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
 ; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
 ; NO-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
-; NO-EVL-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
-; NO-EVL-NEXT:    [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
-; NO-EVL-NEXT:    [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; NO-EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
+; NO-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-EVL-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; NO-EVL-NEXT:    [[TMP7:%.*]] = add <vscale x 4 x i64> [[TMP6]], zeroinitializer
+; NO-EVL-NEXT:    [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP7]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; NO-EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
 ; NO-EVL-NEXT:    [[TMP9:%.*]] = mul i64 1, [[TMP5]]
-; NO-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP9]], i64 0
-; NO-EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; NO-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP9]], i64 0
+; NO-EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NO-EVL:       vector.body:
 ; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x ptr> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
 ; NO-EVL-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 0
-; NO-EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[TMP10]]
-; NO-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds ptr, ptr [[TMP11]], i32 0
-; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x ptr>, ptr [[TMP12]], align 8
-; NO-EVL-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> [[WIDE_LOAD]], i32 4, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i32> poison)
-; NO-EVL-NEXT:    [[TMP13:%.*]] = sext <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i64>
-; NO-EVL-NEXT:    [[TMP14:%.*]] = icmp slt <vscale x 2 x i64> [[VEC_IND]], [[TMP13]]
-; NO-EVL-NEXT:    [[TMP15:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP14]])
-; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP15]], <vscale x 2 x i1> [[TMP14]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
-; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP15]], <vscale x 2 x ptr> [[WIDE_LOAD]], <vscale x 2 x ptr> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP10]]
+; NO-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
+; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP12]], align 4
+; NO-EVL-NEXT:    [[TMP13:%.*]] = zext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
+; NO-EVL-NEXT:    [[TMP14:%.*]] = icmp eq <vscale x 4 x i64> [[VEC_IND]], [[TMP13]]
+; NO-EVL-NEXT:    [[TMP15:%.*]] = xor <vscale x 4 x i1> [[TMP14]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; NO-EVL-NEXT:    [[TMP16:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP15]])
+; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP16]], <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP16]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
 ; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; NO-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; NO-EVL-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; NO-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; NO-EVL-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; NO-EVL:       middle.block:
-; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; NO-EVL-NEXT:    [[TMP17:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
-; NO-EVL-NEXT:    [[TMP18:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP17]])
-; NO-EVL-NEXT:    [[TMP19:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
-; NO-EVL-NEXT:    [[TMP20:%.*]] = icmp eq i32 [[TMP18]], 0
-; NO-EVL-NEXT:    [[TMP21:%.*]] = and i1 [[TMP19]], [[TMP20]]
-; NO-EVL-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], i32 0, i32 -1
-; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x ptr> [[CSA_DATA_SEL]], i32 [[TMP22]]
-; NO-EVL-NEXT:    [[TMP23:%.*]] = icmp sge i32 [[TMP22]], 0
-; NO-EVL-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], ptr [[CSA_EXTRACT]], ptr null
+; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; NO-EVL-NEXT:    [[TMP18:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; NO-EVL-NEXT:    [[TMP19:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP18]])
+; NO-EVL-NEXT:    [[TMP20:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP19]], 0
+; NO-EVL-NEXT:    [[TMP22:%.*]] = and i1 [[TMP20]], [[TMP21]]
+; NO-EVL-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], i32 0, i32 -1
+; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP23]]
+; NO-EVL-NEXT:    [[TMP24:%.*]] = icmp sge i32 [[TMP23]], 0
+; NO-EVL-NEXT:    [[TMP25:%.*]] = select i1 [[TMP24]], i32 [[CSA_EXTRACT]], i32 0
 ; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; NO-EVL:       scalar.ph:
 ; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
 ; NO-EVL:       exit.loopexit:
-; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP24]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
 ; NO-EVL-NEXT:    br label [[EXIT]]
 ; NO-EVL:       exit:
-; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT]] ]
-; NO-EVL-NEXT:    ret ptr [[T_0_LCSSA]]
+; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT]] ]
+; NO-EVL-NEXT:    ret i32 [[T_0_LCSSA]]
 ; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[T_010:%.*]] = phi ptr [ null, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT:    [[TMP25:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
-; NO-EVL-NEXT:    [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4
-; NO-EVL-NEXT:    [[TMP27:%.*]] = sext i32 [[TMP26]] to i64
-; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP27]]
-; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP25]], ptr [[T_010]]
-; NO-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; NO-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[T_010:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
+; NO-EVL-NEXT:    [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT:    [[TMP27:%.*]] = zext i32 [[TMP26]] to i64
+; NO-EVL-NEXT:    [[CMP1_NOT:%.*]] = icmp eq i64 [[IV]], [[TMP27]]
+; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP26]]
+; NO-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+;
+entry:
+  %cmp9 = icmp sgt i32 %N, 0
+  br i1 %cmp9, label %for.body.preheader, label %exit
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+exit:                                 ; preds = %for.body, %entry
+  %t.0.lcssa = phi i32 [ 0, %entry ], [ %spec.select, %for.body ]
+  ret i32 %t.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
+  %t.010 = phi i32 [ 0, %for.body.preheader ], [ %spec.select, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %data, i64 %iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %1 = zext i32 %0 to i64
+  %cmp1.not = icmp eq i64 %iv, %1
+  %spec.select = select i1 %cmp1.not, i32 %t.010, i32 %0
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %exit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int *simple_csa_ptr_select(int N, int **data) {
+;   int *t = nullptr;
+;   for (int i = 0; i < N; i++) {
+;     if (a < *data[i])
+;       t = data[i];
+;   }
+;   return t; // use t
+; }
+define ptr @simple_csa_ptr_select(i32 %N, ptr %data, i64 %a) {
+; CHECK-LABEL: @idx_scalar_dec(
+; CHECK-NOT: vector.body:
 ;
 entry:
   %cmp9 = icmp sgt i32 %N, 0
@@ -559,15 +2167,15 @@ exit:
   ret ptr %t.0.lcssa
 
 for.body:
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
   %t.010 = phi ptr [ null, %for.body.preheader ], [ %spec.select, %for.body ]
-  %arrayidx = getelementptr inbounds ptr, ptr %data, i64 %indvars.iv
+  %arrayidx = getelementptr inbounds ptr, ptr %data, i64 %iv
   %0 = load ptr, ptr %arrayidx, align 8
   %1 = load i32, ptr %0, align 4
   %2 = sext i32 %1 to i64
-  %cmp1 = icmp slt i64 %indvars.iv, %2
+  %cmp1 = icmp slt i64 %a, %2
   %spec.select = select i1 %cmp1, ptr %0, ptr %t.010
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
   br i1 %exitcond.not, label %exit, label %for.body
 }
diff --git a/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment.ll b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment.ll
deleted file mode 100644
index 8f3259b2565e3d..00000000000000
--- a/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment.ll
+++ /dev/null
@@ -1,2415 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -S -passes=loop-vectorize -force-tail-folding-style=data-with-evl \
-; RUN:   -enable-csa-vectorization -scalable-vectorization=on \
-; RUN:   -force-target-supports-scalable-vectors -force-target-instruction-cost=1 \
-; RUN:   | FileCheck %s -check-prefix=EVL
-; RUN: opt < %s -S -passes=loop-vectorize -force-tail-folding-style=none \
-; RUN:   -enable-csa-vectorization -scalable-vectorization=on \
-; RUN:   -force-target-supports-scalable-vectors -force-target-instruction-cost=1 \
-; RUN:   | FileCheck %s -check-prefix=NO-EVL
-
-; This function is generated from the following C/C++ program:
-; int simple_csa_int_select(int N, int *data, int a) {
-;   int t = -1;
-;   for (int i = 0; i < N; i++) {
-;     if (a < data[i])
-;       t = data[i];
-;   }
-;   return t; // use t
-; }
-define i32 @simple_csa_int_select(i32 %N, ptr %data, i64 %a) {
-; EVL-LABEL: @simple_csa_int_select(
-; EVL-NEXT:  entry:
-; EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
-; EVL:       for.body.preheader:
-; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
-; EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; EVL:       vector.ph:
-; EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[A:%.*]], i64 0
-; EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; EVL:       vector.body:
-; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; EVL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP3]]
-; EVL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
-; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP5]], align 4
-; EVL-NEXT:    [[TMP6:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
-; EVL-NEXT:    [[TMP7:%.*]] = icmp slt <vscale x 1 x i64> [[BROADCAST_SPLAT]], [[TMP6]]
-; EVL-NEXT:    [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP7]])
-; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP8]], <vscale x 1 x i1> [[TMP7]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
-; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP8]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
-; EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
-; EVL-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; EVL:       middle.block:
-; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; EVL-NEXT:    [[TMP10:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
-; EVL-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP10]])
-; EVL-NEXT:    [[TMP12:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
-; EVL-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[TMP11]], 0
-; EVL-NEXT:    [[TMP14:%.*]] = and i1 [[TMP12]], [[TMP13]]
-; EVL-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], i32 0, i32 -1
-; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP15]]
-; EVL-NEXT:    [[TMP16:%.*]] = icmp sge i32 [[TMP15]], 0
-; EVL-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[CSA_EXTRACT]], i32 -1
-; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; EVL:       scalar.ph:
-; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       exit.loopexit:
-; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT:    br label [[EXIT]]
-; EVL:       exit:
-; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT]] ]
-; EVL-NEXT:    ret i32 [[T_0_LCSSA]]
-; EVL:       for.body:
-; EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
-; EVL-NEXT:    [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT:    [[TMP19:%.*]] = sext i32 [[TMP18]] to i64
-; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP19]]
-; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP18]], i32 [[T_010]]
-; EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-;
-; NO-EVL-LABEL: @simple_csa_int_select(
-; NO-EVL-NEXT:  entry:
-; NO-EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
-; NO-EVL:       for.body.preheader:
-; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
-; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; NO-EVL:       vector.ph:
-; NO-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[A:%.*]], i64 0
-; NO-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; NO-EVL:       vector.body:
-; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; NO-EVL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP3]]
-; NO-EVL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
-; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP5]], align 4
-; NO-EVL-NEXT:    [[TMP6:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
-; NO-EVL-NEXT:    [[TMP7:%.*]] = icmp slt <vscale x 1 x i64> [[BROADCAST_SPLAT]], [[TMP6]]
-; NO-EVL-NEXT:    [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP7]])
-; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP8]], <vscale x 1 x i1> [[TMP7]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
-; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP8]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
-; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
-; NO-EVL-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; NO-EVL:       middle.block:
-; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; NO-EVL-NEXT:    [[TMP10:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
-; NO-EVL-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP10]])
-; NO-EVL-NEXT:    [[TMP12:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
-; NO-EVL-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[TMP11]], 0
-; NO-EVL-NEXT:    [[TMP14:%.*]] = and i1 [[TMP12]], [[TMP13]]
-; NO-EVL-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], i32 0, i32 -1
-; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP15]]
-; NO-EVL-NEXT:    [[TMP16:%.*]] = icmp sge i32 [[TMP15]], 0
-; NO-EVL-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[CSA_EXTRACT]], i32 -1
-; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; NO-EVL:       scalar.ph:
-; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       exit.loopexit:
-; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT:    br label [[EXIT]]
-; NO-EVL:       exit:
-; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT]] ]
-; NO-EVL-NEXT:    ret i32 [[T_0_LCSSA]]
-; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
-; NO-EVL-NEXT:    [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; NO-EVL-NEXT:    [[TMP19:%.*]] = sext i32 [[TMP18]] to i64
-; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP19]]
-; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP18]], i32 [[T_010]]
-; NO-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-;
-entry:
-  %cmp9 = icmp sgt i32 %N, 0
-  br i1 %cmp9, label %for.body.preheader, label %exit
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %N to i64
-  br label %for.body
-
-exit:                                 ; preds = %for.body, %entry
-  %t.0.lcssa = phi i32 [ -1, %entry ], [ %spec.select, %for.body ]
-  ret i32 %t.0.lcssa
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
-  %t.010 = phi i32 [ -1, %for.body.preheader ], [ %spec.select, %for.body ]
-  %arrayidx = getelementptr inbounds i32, ptr %data, i64 %iv
-  %0 = load i32, ptr %arrayidx, align 4
-  %1 = sext i32 %0 to i64
-  %cmp1 = icmp slt i64 %a, %1
-  %spec.select = select i1 %cmp1, i32 %0, i32 %t.010
-  %iv.next = add nuw nsw i64 %iv, 1
-  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %exit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; int simple_csa_int_select_induction_cmp(int N, int *data) {
-;   int t = -1;
-;   for (int i = 0; i < N; i++) {
-;     if (i < data[i])
-;       t = data[i];
-;   }
-;   return t; // use t
-; }
-define i32 @simple_csa_int_select_induction_cmp(i32 %N, ptr %data) {
-; EVL-LABEL: @simple_csa_int_select_induction_cmp(
-; EVL-NEXT:  entry:
-; EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
-; EVL:       for.body.preheader:
-; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
-; EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; EVL:       vector.ph:
-; EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
-; EVL-NEXT:    [[TMP4:%.*]] = add <vscale x 1 x i64> [[TMP3]], zeroinitializer
-; EVL-NEXT:    [[TMP5:%.*]] = mul <vscale x 1 x i64> [[TMP4]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
-; EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP5]]
-; EVL-NEXT:    [[TMP6:%.*]] = mul i64 1, [[TMP2]]
-; EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP6]], i64 0
-; EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; EVL:       vector.body:
-; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
-; EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP7]]
-; EVL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
-; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP9]], align 4
-; EVL-NEXT:    [[TMP10:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
-; EVL-NEXT:    [[TMP11:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP10]]
-; EVL-NEXT:    [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP11]])
-; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP12]], <vscale x 1 x i1> [[TMP11]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
-; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP12]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
-; EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
-; EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; EVL-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; EVL:       middle.block:
-; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; EVL-NEXT:    [[TMP14:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
-; EVL-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP14]])
-; EVL-NEXT:    [[TMP16:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
-; EVL-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[TMP15]], 0
-; EVL-NEXT:    [[TMP18:%.*]] = and i1 [[TMP16]], [[TMP17]]
-; EVL-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 0, i32 -1
-; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP19]]
-; EVL-NEXT:    [[TMP20:%.*]] = icmp sge i32 [[TMP19]], 0
-; EVL-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[CSA_EXTRACT]], i32 -1
-; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; EVL:       scalar.ph:
-; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       exit.loopexit:
-; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT:    br label [[EXIT]]
-; EVL:       exit:
-; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT]] ]
-; EVL-NEXT:    ret i32 [[T_0_LCSSA]]
-; EVL:       for.body:
-; EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
-; EVL-NEXT:    [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT:    [[TMP23:%.*]] = sext i32 [[TMP22]] to i64
-; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[IV]], [[TMP23]]
-; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP22]], i32 [[T_010]]
-; EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-;
-; NO-EVL-LABEL: @simple_csa_int_select_induction_cmp(
-; NO-EVL-NEXT:  entry:
-; NO-EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
-; NO-EVL:       for.body.preheader:
-; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
-; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; NO-EVL:       vector.ph:
-; NO-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
-; NO-EVL-NEXT:    [[TMP4:%.*]] = add <vscale x 1 x i64> [[TMP3]], zeroinitializer
-; NO-EVL-NEXT:    [[TMP5:%.*]] = mul <vscale x 1 x i64> [[TMP4]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
-; NO-EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP5]]
-; NO-EVL-NEXT:    [[TMP6:%.*]] = mul i64 1, [[TMP2]]
-; NO-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP6]], i64 0
-; NO-EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; NO-EVL:       vector.body:
-; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
-; NO-EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP7]]
-; NO-EVL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
-; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP9]], align 4
-; NO-EVL-NEXT:    [[TMP10:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
-; NO-EVL-NEXT:    [[TMP11:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP10]]
-; NO-EVL-NEXT:    [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP11]])
-; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP12]], <vscale x 1 x i1> [[TMP11]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
-; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP12]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
-; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
-; NO-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; NO-EVL-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; NO-EVL:       middle.block:
-; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; NO-EVL-NEXT:    [[TMP14:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
-; NO-EVL-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP14]])
-; NO-EVL-NEXT:    [[TMP16:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
-; NO-EVL-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[TMP15]], 0
-; NO-EVL-NEXT:    [[TMP18:%.*]] = and i1 [[TMP16]], [[TMP17]]
-; NO-EVL-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 0, i32 -1
-; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP19]]
-; NO-EVL-NEXT:    [[TMP20:%.*]] = icmp sge i32 [[TMP19]], 0
-; NO-EVL-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[CSA_EXTRACT]], i32 -1
-; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; NO-EVL:       scalar.ph:
-; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       exit.loopexit:
-; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT:    br label [[EXIT]]
-; NO-EVL:       exit:
-; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT]] ]
-; NO-EVL-NEXT:    ret i32 [[T_0_LCSSA]]
-; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
-; NO-EVL-NEXT:    [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; NO-EVL-NEXT:    [[TMP23:%.*]] = sext i32 [[TMP22]] to i64
-; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[IV]], [[TMP23]]
-; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP22]], i32 [[T_010]]
-; NO-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-;
-entry:
-  %cmp9 = icmp sgt i32 %N, 0
-  br i1 %cmp9, label %for.body.preheader, label %exit
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %N to i64
-  br label %for.body
-
-exit:                                 ; preds = %for.body, %entry
-  %t.0.lcssa = phi i32 [ -1, %entry ], [ %spec.select, %for.body ]
-  ret i32 %t.0.lcssa
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
-  %t.010 = phi i32 [ -1, %for.body.preheader ], [ %spec.select, %for.body ]
-  %arrayidx = getelementptr inbounds i32, ptr %data, i64 %iv
-  %0 = load i32, ptr %arrayidx, align 4
-  %1 = sext i32 %0 to i64
-  %cmp1 = icmp slt i64 %iv, %1
-  %spec.select = select i1 %cmp1, i32 %0, i32 %t.010
-  %iv.next = add nuw nsw i64 %iv, 1
-  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %exit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; float simple_csa_float_select(int N, float *data) {
-;   float t = 1.0f;
-;   for (int i = 0; i < N; i++) {
-;     if (0.0f < data[i])
-;       t = data[i];
-;   }
-;   return t; // use t
-; }
-define float @simple_csa_float_select(i32 %N, ptr %data) {
-; EVL-LABEL: @simple_csa_float_select(
-; EVL-NEXT:  entry:
-; EVL-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
-; EVL:       for.body.preheader:
-; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
-; EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; EVL:       vector.ph:
-; EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; EVL:       vector.body:
-; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; EVL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[TMP3]]
-; EVL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 0
-; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x float>, ptr [[TMP5]], align 4
-; EVL-NEXT:    [[TMP6:%.*]] = fcmp ogt <vscale x 1 x float> [[WIDE_LOAD]], zeroinitializer
-; EVL-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP6]])
-; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP7]], <vscale x 1 x i1> [[TMP6]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
-; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP7]], <vscale x 1 x float> [[WIDE_LOAD]], <vscale x 1 x float> [[CSA_DATA_PHI]]
-; EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
-; EVL-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; EVL:       middle.block:
-; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; EVL-NEXT:    [[TMP9:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
-; EVL-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP9]])
-; EVL-NEXT:    [[TMP11:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
-; EVL-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[TMP10]], 0
-; EVL-NEXT:    [[TMP13:%.*]] = and i1 [[TMP11]], [[TMP12]]
-; EVL-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 0, i32 -1
-; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x float> [[CSA_DATA_SEL]], i32 [[TMP14]]
-; EVL-NEXT:    [[TMP15:%.*]] = icmp sge i32 [[TMP14]], 0
-; EVL-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], float [[CSA_EXTRACT]], float 1.000000e+00
-; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; EVL:       scalar.ph:
-; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       exit.loopexit:
-; EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT:    br label [[EXIT]]
-; EVL:       exit:
-; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[EXIT_LOOPEXIT]] ]
-; EVL-NEXT:    ret float [[T_0_LCSSA]]
-; EVL:       for.body:
-; EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[T_09:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[IV]]
-; EVL-NEXT:    [[TMP17:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP17]], 0.000000e+00
-; EVL-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP17]], float [[T_09]]
-; EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-;
-; NO-EVL-LABEL: @simple_csa_float_select(
-; NO-EVL-NEXT:  entry:
-; NO-EVL-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
-; NO-EVL:       for.body.preheader:
-; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
-; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; NO-EVL:       vector.ph:
-; NO-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; NO-EVL:       vector.body:
-; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; NO-EVL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[TMP3]]
-; NO-EVL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 0
-; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x float>, ptr [[TMP5]], align 4
-; NO-EVL-NEXT:    [[TMP6:%.*]] = fcmp ogt <vscale x 1 x float> [[WIDE_LOAD]], zeroinitializer
-; NO-EVL-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP6]])
-; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP7]], <vscale x 1 x i1> [[TMP6]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
-; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP7]], <vscale x 1 x float> [[WIDE_LOAD]], <vscale x 1 x float> [[CSA_DATA_PHI]]
-; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
-; NO-EVL-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; NO-EVL:       middle.block:
-; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; NO-EVL-NEXT:    [[TMP9:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
-; NO-EVL-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP9]])
-; NO-EVL-NEXT:    [[TMP11:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
-; NO-EVL-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[TMP10]], 0
-; NO-EVL-NEXT:    [[TMP13:%.*]] = and i1 [[TMP11]], [[TMP12]]
-; NO-EVL-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 0, i32 -1
-; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x float> [[CSA_DATA_SEL]], i32 [[TMP14]]
-; NO-EVL-NEXT:    [[TMP15:%.*]] = icmp sge i32 [[TMP14]], 0
-; NO-EVL-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], float [[CSA_EXTRACT]], float 1.000000e+00
-; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; NO-EVL:       scalar.ph:
-; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       exit.loopexit:
-; NO-EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT:    br label [[EXIT]]
-; NO-EVL:       exit:
-; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[EXIT_LOOPEXIT]] ]
-; NO-EVL-NEXT:    ret float [[T_0_LCSSA]]
-; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[T_09:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[IV]]
-; NO-EVL-NEXT:    [[TMP17:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; NO-EVL-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP17]], 0.000000e+00
-; NO-EVL-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP17]], float [[T_09]]
-; NO-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-;
-entry:
-  %cmp8 = icmp sgt i32 %N, 0
-  br i1 %cmp8, label %for.body.preheader, label %exit
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %N to i64
-  br label %for.body
-
-exit:                                 ; preds = %for.body, %entry
-  %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.1, %for.body ]
-  ret float %t.0.lcssa
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
-  %t.09 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.body ]
-  %arrayidx = getelementptr inbounds float, ptr %data, i64 %iv
-  %0 = load float, ptr %arrayidx, align 4
-  %cmp1 = fcmp ogt float %0, 0.000000e+00
-  %t.1 = select i1 %cmp1, float %0, float %t.09
-  %iv.next = add nuw nsw i64 %iv, 1
-  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %exit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; int simple_csa_int(int N, bool *cond, int *data) {
-;   int t = -1;
-;   for (int i = 0; i < N; i++) {
-;     if (cond[i])
-;       t = data[i];
-;   }
-;   return t; // use t
-; }
-define i32 @simple_csa_int(i32 %N, ptr %cond, ptr %data) {
-; EVL-LABEL: @simple_csa_int(
-; EVL-NOT: vector.body:
-;
-; NO-EVL-LABEL: @simple_csa_int(
-; NO-EVL-NOT: vector.body:
-;
-entry:
-  %cmp6 = icmp sgt i32 %N, 0
-  br i1 %cmp6, label %for.body.preheader, label %exit
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %N to i64
-  br label %for.body
-
-exit:                                 ; preds = %for.inc, %entry
-  %t.0.lcssa = phi i32 [ -1, %entry ], [ %t.1, %for.inc ]
-  ret i32 %t.0.lcssa
-
-for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
-  %t.07 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
-  %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %iv
-  %0 = load i8, ptr %arrayidx, align 1
-  %tobool.not = icmp eq i8 %0, 0
-  br i1 %tobool.not, label %for.inc, label %if.then
-
-if.then:                                          ; preds = %for.body
-  %arrayidx2 = getelementptr inbounds i32, ptr %data, i64 %iv
-  %1 = load i32, ptr %arrayidx2, align 4
-  br label %for.inc
-
-for.inc:                                          ; preds = %for.body, %if.then
-  %t.1 = phi i32 [ %1, %if.then ], [ %t.07, %for.body ]
-  %iv.next = add nuw nsw i64 %iv, 1
-  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %exit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; float simple_csa_float(int N, bool *cond, float *data) {
-;   float t = 1.0f;
-;   for (int i = 0; i < N; i++) {
-;     if (cond[i])
-;       t = data[i];
-;   }
-;   return t; // use t
-; }
-define float @simple_csa_float(i32 %N, ptr %cond, ptr %data) {
-; EVL-LABEL: @simple_csa_float(
-; EVL-NOT: vector.body:
-;
-; NO-EVL-LABEL: @simple_csa_float(
-; NO-EVL-NOT: vector.body:
-;
-entry:
-  %cmp6 = icmp sgt i32 %N, 0
-  br i1 %cmp6, label %for.body.preheader, label %exit
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %N to i64
-  br label %for.body
-
-exit:                                 ; preds = %for.inc, %entry
-  %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.1, %for.inc ]
-  ret float %t.0.lcssa
-
-for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
-  %t.07 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
-  %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %iv
-  %0 = load i8, ptr %arrayidx, align 1
-  %tobool.not = icmp eq i8 %0, 0
-  br i1 %tobool.not, label %for.inc, label %if.then
-
-if.then:                                          ; preds = %for.body
-  %arrayidx2 = getelementptr inbounds float, ptr %data, i64 %iv
-  %1 = load float, ptr %arrayidx2, align 4
-  br label %for.inc
-
-for.inc:                                          ; preds = %for.body, %if.then
-  %t.1 = phi float [ %1, %if.then ], [ %t.07, %for.body ]
-  %iv.next = add nuw nsw i64 %iv, 1
-  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %exit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; int csa_in_series_int_select(int N, int *data0, int *data1, int a) {
-;   int t = -1;
-;   int s = -1;
-;   for (int i = 0; i < N; i++) {
-;     if (a < data0[i])
-;       t = data0[i];
-;     if (a < data1[i])
-;       s = data1[i];
-;   }
-;   return t | s; // use t and s
-; }
-define i32 @csa_in_series_int_select(i32 %N, ptr %data0, ptr %data1, i64 %a) {
-; EVL-LABEL: @csa_in_series_int_select(
-; EVL-NEXT:  entry:
-; EVL-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
-; EVL:       for.body.preheader:
-; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
-; EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; EVL:       vector.ph:
-; EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[A:%.*]], i64 0
-; EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; EVL:       vector.body:
-; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; EVL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP3]]
-; EVL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
-; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP5]], align 4
-; EVL-NEXT:    [[TMP6:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
-; EVL-NEXT:    [[TMP7:%.*]] = icmp slt <vscale x 1 x i64> [[BROADCAST_SPLAT]], [[TMP6]]
-; EVL-NEXT:    [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP7]])
-; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP8]], <vscale x 1 x i1> [[TMP7]], <vscale x 1 x i1> [[CSA_MASK_PHI1]]
-; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP8]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI2]]
-; EVL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP3]]
-; EVL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
-; EVL-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 1 x i32>, ptr [[TMP10]], align 4
-; EVL-NEXT:    [[TMP11:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD3]] to <vscale x 1 x i64>
-; EVL-NEXT:    [[TMP12:%.*]] = icmp slt <vscale x 1 x i64> [[BROADCAST_SPLAT]], [[TMP11]]
-; EVL-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP12]])
-; EVL-NEXT:    [[CSA_MASK_SEL4]] = select i1 [[TMP13]], <vscale x 1 x i1> [[TMP12]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
-; EVL-NEXT:    [[CSA_DATA_SEL5]] = select i1 [[TMP13]], <vscale x 1 x i32> [[WIDE_LOAD3]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
-; EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
-; EVL-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; EVL:       middle.block:
-; EVL-NEXT:    [[CSA_STEP6:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; EVL-NEXT:    [[TMP15:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL4]], <vscale x 1 x i32> [[CSA_STEP6]], <vscale x 1 x i32> zeroinitializer
-; EVL-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP15]])
-; EVL-NEXT:    [[TMP17:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL4]], i64 0
-; EVL-NEXT:    [[TMP18:%.*]] = icmp eq i32 [[TMP16]], 0
-; EVL-NEXT:    [[TMP19:%.*]] = and i1 [[TMP17]], [[TMP18]]
-; EVL-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 0, i32 -1
-; EVL-NEXT:    [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL5]], i32 [[TMP20]]
-; EVL-NEXT:    [[TMP21:%.*]] = icmp sge i32 [[TMP20]], 0
-; EVL-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[CSA_EXTRACT7]], i32 -1
-; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; EVL-NEXT:    [[TMP23:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
-; EVL-NEXT:    [[TMP24:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP23]])
-; EVL-NEXT:    [[TMP25:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
-; EVL-NEXT:    [[TMP26:%.*]] = icmp eq i32 [[TMP24]], 0
-; EVL-NEXT:    [[TMP27:%.*]] = and i1 [[TMP25]], [[TMP26]]
-; EVL-NEXT:    [[TMP28:%.*]] = select i1 [[TMP27]], i32 0, i32 -1
-; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP28]]
-; EVL-NEXT:    [[TMP29:%.*]] = icmp sge i32 [[TMP28]], 0
-; EVL-NEXT:    [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[CSA_EXTRACT]], i32 -1
-; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; EVL:       scalar.ph:
-; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       exit.loopexit:
-; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP30]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT:    [[TMP31:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
-; EVL-NEXT:    br label [[EXIT]]
-; EVL:       exit:
-; EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP31]], [[EXIT_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
-; EVL-NEXT:    ret i32 [[OR]]
-; EVL:       for.body:
-; EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[IV]]
-; EVL-NEXT:    [[TMP32:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT:    [[TMP33:%.*]] = sext i32 [[TMP32]] to i64
-; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP33]]
-; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP32]], i32 [[T_022]]
-; EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[IV]]
-; EVL-NEXT:    [[TMP34:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; EVL-NEXT:    [[TMP35:%.*]] = sext i32 [[TMP34]] to i64
-; EVL-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[A]], [[TMP35]]
-; EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP34]], i32 [[S_023]]
-; EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-;
-; NO-EVL-LABEL: @csa_in_series_int_select(
-; NO-EVL-NEXT:  entry:
-; NO-EVL-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
-; NO-EVL:       for.body.preheader:
-; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
-; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; NO-EVL:       vector.ph:
-; NO-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[A:%.*]], i64 0
-; NO-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; NO-EVL:       vector.body:
-; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; NO-EVL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP3]]
-; NO-EVL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
-; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP5]], align 4
-; NO-EVL-NEXT:    [[TMP6:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
-; NO-EVL-NEXT:    [[TMP7:%.*]] = icmp slt <vscale x 1 x i64> [[BROADCAST_SPLAT]], [[TMP6]]
-; NO-EVL-NEXT:    [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP7]])
-; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP8]], <vscale x 1 x i1> [[TMP7]], <vscale x 1 x i1> [[CSA_MASK_PHI1]]
-; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP8]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI2]]
-; NO-EVL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP3]]
-; NO-EVL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
-; NO-EVL-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 1 x i32>, ptr [[TMP10]], align 4
-; NO-EVL-NEXT:    [[TMP11:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD3]] to <vscale x 1 x i64>
-; NO-EVL-NEXT:    [[TMP12:%.*]] = icmp slt <vscale x 1 x i64> [[BROADCAST_SPLAT]], [[TMP11]]
-; NO-EVL-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP12]])
-; NO-EVL-NEXT:    [[CSA_MASK_SEL4]] = select i1 [[TMP13]], <vscale x 1 x i1> [[TMP12]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
-; NO-EVL-NEXT:    [[CSA_DATA_SEL5]] = select i1 [[TMP13]], <vscale x 1 x i32> [[WIDE_LOAD3]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
-; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
-; NO-EVL-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; NO-EVL:       middle.block:
-; NO-EVL-NEXT:    [[CSA_STEP6:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; NO-EVL-NEXT:    [[TMP15:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL4]], <vscale x 1 x i32> [[CSA_STEP6]], <vscale x 1 x i32> zeroinitializer
-; NO-EVL-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP15]])
-; NO-EVL-NEXT:    [[TMP17:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL4]], i64 0
-; NO-EVL-NEXT:    [[TMP18:%.*]] = icmp eq i32 [[TMP16]], 0
-; NO-EVL-NEXT:    [[TMP19:%.*]] = and i1 [[TMP17]], [[TMP18]]
-; NO-EVL-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 0, i32 -1
-; NO-EVL-NEXT:    [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL5]], i32 [[TMP20]]
-; NO-EVL-NEXT:    [[TMP21:%.*]] = icmp sge i32 [[TMP20]], 0
-; NO-EVL-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[CSA_EXTRACT7]], i32 -1
-; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; NO-EVL-NEXT:    [[TMP23:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
-; NO-EVL-NEXT:    [[TMP24:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP23]])
-; NO-EVL-NEXT:    [[TMP25:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
-; NO-EVL-NEXT:    [[TMP26:%.*]] = icmp eq i32 [[TMP24]], 0
-; NO-EVL-NEXT:    [[TMP27:%.*]] = and i1 [[TMP25]], [[TMP26]]
-; NO-EVL-NEXT:    [[TMP28:%.*]] = select i1 [[TMP27]], i32 0, i32 -1
-; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP28]]
-; NO-EVL-NEXT:    [[TMP29:%.*]] = icmp sge i32 [[TMP28]], 0
-; NO-EVL-NEXT:    [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[CSA_EXTRACT]], i32 -1
-; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; NO-EVL:       scalar.ph:
-; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       exit.loopexit:
-; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP30]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT:    [[TMP31:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
-; NO-EVL-NEXT:    br label [[EXIT]]
-; NO-EVL:       exit:
-; NO-EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP31]], [[EXIT_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
-; NO-EVL-NEXT:    ret i32 [[OR]]
-; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[IV]]
-; NO-EVL-NEXT:    [[TMP32:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; NO-EVL-NEXT:    [[TMP33:%.*]] = sext i32 [[TMP32]] to i64
-; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP33]]
-; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP32]], i32 [[T_022]]
-; NO-EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[IV]]
-; NO-EVL-NEXT:    [[TMP34:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; NO-EVL-NEXT:    [[TMP35:%.*]] = sext i32 [[TMP34]] to i64
-; NO-EVL-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[A]], [[TMP35]]
-; NO-EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP34]], i32 [[S_023]]
-; NO-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-;
-entry:
-  %cmp21 = icmp sgt i32 %N, 0
-  br i1 %cmp21, label %for.body.preheader, label %exit
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %N to i64
-  br label %for.body
-
-exit.loopexit:                        ; preds = %for.body
-  %0 = or i32 %s.1, %spec.select
-  br label %exit
-
-exit:                                 ; preds = %exit.loopexit, %entry
-  %or = phi i32 [ %0, %exit.loopexit ], [ -1, %entry ]
-  ret i32 %or
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
-  %s.023 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.body ]
-  %t.022 = phi i32 [ -1, %for.body.preheader ], [ %spec.select, %for.body ]
-  %arrayidx = getelementptr inbounds i32, ptr %data0, i64 %iv
-  %1 = load i32, ptr %arrayidx, align 4
-  %2 = sext i32 %1 to i64
-  %cmp1 = icmp slt i64 %a, %2
-  %spec.select = select i1 %cmp1, i32 %1, i32 %t.022
-  %arrayidx5 = getelementptr inbounds i32, ptr %data1, i64 %iv
-  %3 = load i32, ptr %arrayidx5, align 4
-  %4 = sext i32 %3 to i64
-  %cmp6 = icmp slt i64 %a, %4
-  %s.1 = select i1 %cmp6, i32 %3, i32 %s.023
-  %iv.next = add nuw nsw i64 %iv, 1
-  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %exit.loopexit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; int csa_in_series_int_select_induction_cmp(int N, int *data0, int *data1) {
-;   int t = -1;
-;   int s = -1;
-;   for (int i = 0; i < N; i++) {
-;     if (i < data0[i])
-;       t = data0[i];
-;     if (i < data1[i])
-;       s = data1[i];
-;   }
-;   return t | s; // use t and s
-; }
-define i32 @csa_in_series_int_select_induction_cmp(i32 %N, ptr %data0, ptr %data1) {
-; EVL-LABEL: @csa_in_series_int_select_induction_cmp(
-; EVL-NEXT:  entry:
-; EVL-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
-; EVL:       for.body.preheader:
-; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
-; EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; EVL:       vector.ph:
-; EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
-; EVL-NEXT:    [[TMP4:%.*]] = add <vscale x 1 x i64> [[TMP3]], zeroinitializer
-; EVL-NEXT:    [[TMP5:%.*]] = mul <vscale x 1 x i64> [[TMP4]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
-; EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP5]]
-; EVL-NEXT:    [[TMP6:%.*]] = mul i64 1, [[TMP2]]
-; EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP6]], i64 0
-; EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; EVL:       vector.body:
-; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
-; EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP7]]
-; EVL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
-; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP9]], align 4
-; EVL-NEXT:    [[TMP10:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
-; EVL-NEXT:    [[TMP11:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP10]]
-; EVL-NEXT:    [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP11]])
-; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP12]], <vscale x 1 x i1> [[TMP11]], <vscale x 1 x i1> [[CSA_MASK_PHI1]]
-; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP12]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI2]]
-; EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP7]]
-; EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
-; EVL-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 1 x i32>, ptr [[TMP14]], align 4
-; EVL-NEXT:    [[TMP15:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD3]] to <vscale x 1 x i64>
-; EVL-NEXT:    [[TMP16:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP15]]
-; EVL-NEXT:    [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP16]])
-; EVL-NEXT:    [[CSA_MASK_SEL4]] = select i1 [[TMP17]], <vscale x 1 x i1> [[TMP16]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
-; EVL-NEXT:    [[CSA_DATA_SEL5]] = select i1 [[TMP17]], <vscale x 1 x i32> [[WIDE_LOAD3]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
-; EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
-; EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; EVL-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; EVL:       middle.block:
-; EVL-NEXT:    [[CSA_STEP6:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; EVL-NEXT:    [[TMP19:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL4]], <vscale x 1 x i32> [[CSA_STEP6]], <vscale x 1 x i32> zeroinitializer
-; EVL-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP19]])
-; EVL-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL4]], i64 0
-; EVL-NEXT:    [[TMP22:%.*]] = icmp eq i32 [[TMP20]], 0
-; EVL-NEXT:    [[TMP23:%.*]] = and i1 [[TMP21]], [[TMP22]]
-; EVL-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i32 0, i32 -1
-; EVL-NEXT:    [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL5]], i32 [[TMP24]]
-; EVL-NEXT:    [[TMP25:%.*]] = icmp sge i32 [[TMP24]], 0
-; EVL-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[CSA_EXTRACT7]], i32 -1
-; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; EVL-NEXT:    [[TMP27:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
-; EVL-NEXT:    [[TMP28:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP27]])
-; EVL-NEXT:    [[TMP29:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
-; EVL-NEXT:    [[TMP30:%.*]] = icmp eq i32 [[TMP28]], 0
-; EVL-NEXT:    [[TMP31:%.*]] = and i1 [[TMP29]], [[TMP30]]
-; EVL-NEXT:    [[TMP32:%.*]] = select i1 [[TMP31]], i32 0, i32 -1
-; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP32]]
-; EVL-NEXT:    [[TMP33:%.*]] = icmp sge i32 [[TMP32]], 0
-; EVL-NEXT:    [[TMP34:%.*]] = select i1 [[TMP33]], i32 [[CSA_EXTRACT]], i32 -1
-; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; EVL:       scalar.ph:
-; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       exit.loopexit:
-; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP34]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT:    [[TMP35:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
-; EVL-NEXT:    br label [[EXIT]]
-; EVL:       exit:
-; EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP35]], [[EXIT_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
-; EVL-NEXT:    ret i32 [[OR]]
-; EVL:       for.body:
-; EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[IV]]
-; EVL-NEXT:    [[TMP36:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT:    [[TMP37:%.*]] = sext i32 [[TMP36]] to i64
-; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[IV]], [[TMP37]]
-; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP36]], i32 [[T_022]]
-; EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[IV]]
-; EVL-NEXT:    [[TMP38:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; EVL-NEXT:    [[TMP39:%.*]] = sext i32 [[TMP38]] to i64
-; EVL-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[IV]], [[TMP39]]
-; EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP38]], i32 [[S_023]]
-; EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-;
-; NO-EVL-LABEL: @csa_in_series_int_select_induction_cmp(
-; NO-EVL-NEXT:  entry:
-; NO-EVL-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
-; NO-EVL:       for.body.preheader:
-; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
-; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; NO-EVL:       vector.ph:
-; NO-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
-; NO-EVL-NEXT:    [[TMP4:%.*]] = add <vscale x 1 x i64> [[TMP3]], zeroinitializer
-; NO-EVL-NEXT:    [[TMP5:%.*]] = mul <vscale x 1 x i64> [[TMP4]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
-; NO-EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP5]]
-; NO-EVL-NEXT:    [[TMP6:%.*]] = mul i64 1, [[TMP2]]
-; NO-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP6]], i64 0
-; NO-EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; NO-EVL:       vector.body:
-; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
-; NO-EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP7]]
-; NO-EVL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
-; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP9]], align 4
-; NO-EVL-NEXT:    [[TMP10:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
-; NO-EVL-NEXT:    [[TMP11:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP10]]
-; NO-EVL-NEXT:    [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP11]])
-; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP12]], <vscale x 1 x i1> [[TMP11]], <vscale x 1 x i1> [[CSA_MASK_PHI1]]
-; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP12]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI2]]
-; NO-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP7]]
-; NO-EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
-; NO-EVL-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 1 x i32>, ptr [[TMP14]], align 4
-; NO-EVL-NEXT:    [[TMP15:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD3]] to <vscale x 1 x i64>
-; NO-EVL-NEXT:    [[TMP16:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP15]]
-; NO-EVL-NEXT:    [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP16]])
-; NO-EVL-NEXT:    [[CSA_MASK_SEL4]] = select i1 [[TMP17]], <vscale x 1 x i1> [[TMP16]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
-; NO-EVL-NEXT:    [[CSA_DATA_SEL5]] = select i1 [[TMP17]], <vscale x 1 x i32> [[WIDE_LOAD3]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
-; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
-; NO-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; NO-EVL-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; NO-EVL:       middle.block:
-; NO-EVL-NEXT:    [[CSA_STEP6:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; NO-EVL-NEXT:    [[TMP19:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL4]], <vscale x 1 x i32> [[CSA_STEP6]], <vscale x 1 x i32> zeroinitializer
-; NO-EVL-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP19]])
-; NO-EVL-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL4]], i64 0
-; NO-EVL-NEXT:    [[TMP22:%.*]] = icmp eq i32 [[TMP20]], 0
-; NO-EVL-NEXT:    [[TMP23:%.*]] = and i1 [[TMP21]], [[TMP22]]
-; NO-EVL-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i32 0, i32 -1
-; NO-EVL-NEXT:    [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL5]], i32 [[TMP24]]
-; NO-EVL-NEXT:    [[TMP25:%.*]] = icmp sge i32 [[TMP24]], 0
-; NO-EVL-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[CSA_EXTRACT7]], i32 -1
-; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; NO-EVL-NEXT:    [[TMP27:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
-; NO-EVL-NEXT:    [[TMP28:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP27]])
-; NO-EVL-NEXT:    [[TMP29:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
-; NO-EVL-NEXT:    [[TMP30:%.*]] = icmp eq i32 [[TMP28]], 0
-; NO-EVL-NEXT:    [[TMP31:%.*]] = and i1 [[TMP29]], [[TMP30]]
-; NO-EVL-NEXT:    [[TMP32:%.*]] = select i1 [[TMP31]], i32 0, i32 -1
-; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP32]]
-; NO-EVL-NEXT:    [[TMP33:%.*]] = icmp sge i32 [[TMP32]], 0
-; NO-EVL-NEXT:    [[TMP34:%.*]] = select i1 [[TMP33]], i32 [[CSA_EXTRACT]], i32 -1
-; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; NO-EVL:       scalar.ph:
-; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       exit.loopexit:
-; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP34]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT:    [[TMP35:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
-; NO-EVL-NEXT:    br label [[EXIT]]
-; NO-EVL:       exit:
-; NO-EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP35]], [[EXIT_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
-; NO-EVL-NEXT:    ret i32 [[OR]]
-; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[IV]]
-; NO-EVL-NEXT:    [[TMP36:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; NO-EVL-NEXT:    [[TMP37:%.*]] = sext i32 [[TMP36]] to i64
-; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[IV]], [[TMP37]]
-; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP36]], i32 [[T_022]]
-; NO-EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[IV]]
-; NO-EVL-NEXT:    [[TMP38:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; NO-EVL-NEXT:    [[TMP39:%.*]] = sext i32 [[TMP38]] to i64
-; NO-EVL-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[IV]], [[TMP39]]
-; NO-EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP38]], i32 [[S_023]]
-; NO-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-;
-entry:
-  %cmp21 = icmp sgt i32 %N, 0
-  br i1 %cmp21, label %for.body.preheader, label %exit
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %N to i64
-  br label %for.body
-
-exit.loopexit:                        ; preds = %for.body
-  %0 = or i32 %s.1, %spec.select
-  br label %exit
-
-exit:                                 ; preds = %exit.loopexit, %entry
-  %or = phi i32 [ %0, %exit.loopexit ], [ -1, %entry ]
-  ret i32 %or
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
-  %s.023 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.body ]
-  %t.022 = phi i32 [ -1, %for.body.preheader ], [ %spec.select, %for.body ]
-  %arrayidx = getelementptr inbounds i32, ptr %data0, i64 %iv
-  %1 = load i32, ptr %arrayidx, align 4
-  %2 = sext i32 %1 to i64
-  %cmp1 = icmp slt i64 %iv, %2
-  %spec.select = select i1 %cmp1, i32 %1, i32 %t.022
-  %arrayidx5 = getelementptr inbounds i32, ptr %data1, i64 %iv
-  %3 = load i32, ptr %arrayidx5, align 4
-  %4 = sext i32 %3 to i64
-  %cmp6 = icmp slt i64 %iv, %4
-  %s.1 = select i1 %cmp6, i32 %3, i32 %s.023
-  %iv.next = add nuw nsw i64 %iv, 1
-  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %exit.loopexit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; float csa_in_series_float_select(int N, float *data0,
-;                           float *data1) {
-;   float t = 1.0f;
-;   float s = 1.0f;
-;   for (int i = 0; i < N; i++) {
-;     if (0.0f < data0[i])
-;       t = data0[i];
-;     if (0.0f <data1[i])
-;       s = data1[i];
-;   }
-;   return t + s; // use t and s
-; }
-define float @csa_in_series_float_select(i32 %N, ptr %data0, ptr %data1) {
-; EVL-LABEL: @csa_in_series_float_select(
-; EVL-NEXT:  entry:
-; EVL-NEXT:    [[CMP19:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
-; EVL:       for.body.preheader:
-; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
-; EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; EVL:       vector.ph:
-; EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; EVL:       vector.body:
-; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 1 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; EVL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[TMP3]]
-; EVL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 0
-; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x float>, ptr [[TMP5]], align 4
-; EVL-NEXT:    [[TMP6:%.*]] = fcmp ogt <vscale x 1 x float> [[WIDE_LOAD]], zeroinitializer
-; EVL-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP6]])
-; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP7]], <vscale x 1 x i1> [[TMP6]], <vscale x 1 x i1> [[CSA_MASK_PHI1]]
-; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP7]], <vscale x 1 x float> [[WIDE_LOAD]], <vscale x 1 x float> [[CSA_DATA_PHI2]]
-; EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[TMP3]]
-; EVL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0
-; EVL-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 1 x float>, ptr [[TMP9]], align 4
-; EVL-NEXT:    [[TMP10:%.*]] = fcmp ogt <vscale x 1 x float> [[WIDE_LOAD3]], zeroinitializer
-; EVL-NEXT:    [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP10]])
-; EVL-NEXT:    [[CSA_MASK_SEL4]] = select i1 [[TMP11]], <vscale x 1 x i1> [[TMP10]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
-; EVL-NEXT:    [[CSA_DATA_SEL5]] = select i1 [[TMP11]], <vscale x 1 x float> [[WIDE_LOAD3]], <vscale x 1 x float> [[CSA_DATA_PHI]]
-; EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
-; EVL-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; EVL:       middle.block:
-; EVL-NEXT:    [[CSA_STEP6:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; EVL-NEXT:    [[TMP13:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL4]], <vscale x 1 x i32> [[CSA_STEP6]], <vscale x 1 x i32> zeroinitializer
-; EVL-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP13]])
-; EVL-NEXT:    [[TMP15:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL4]], i64 0
-; EVL-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[TMP14]], 0
-; EVL-NEXT:    [[TMP17:%.*]] = and i1 [[TMP15]], [[TMP16]]
-; EVL-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i32 0, i32 -1
-; EVL-NEXT:    [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 1 x float> [[CSA_DATA_SEL5]], i32 [[TMP18]]
-; EVL-NEXT:    [[TMP19:%.*]] = icmp sge i32 [[TMP18]], 0
-; EVL-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], float [[CSA_EXTRACT7]], float 1.000000e+00
-; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; EVL-NEXT:    [[TMP21:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
-; EVL-NEXT:    [[TMP22:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP21]])
-; EVL-NEXT:    [[TMP23:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
-; EVL-NEXT:    [[TMP24:%.*]] = icmp eq i32 [[TMP22]], 0
-; EVL-NEXT:    [[TMP25:%.*]] = and i1 [[TMP23]], [[TMP24]]
-; EVL-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i32 0, i32 -1
-; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x float> [[CSA_DATA_SEL]], i32 [[TMP26]]
-; EVL-NEXT:    [[TMP27:%.*]] = icmp sge i32 [[TMP26]], 0
-; EVL-NEXT:    [[TMP28:%.*]] = select i1 [[TMP27]], float [[CSA_EXTRACT]], float 1.000000e+00
-; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; EVL:       scalar.ph:
-; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       exit.loopexit:
-; EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT:    [[TMP29:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
-; EVL-NEXT:    br label [[EXIT]]
-; EVL:       exit:
-; EVL-NEXT:    [[ADD:%.*]] = phi float [ [[TMP29]], [[EXIT_LOOPEXIT]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
-; EVL-NEXT:    ret float [[ADD]]
-; EVL:       for.body:
-; EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[S_021:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[T_020:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0]], i64 [[IV]]
-; EVL-NEXT:    [[TMP30:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP30]], 0.000000e+00
-; EVL-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP30]], float [[T_020]]
-; EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i64 [[IV]]
-; EVL-NEXT:    [[TMP31:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
-; EVL-NEXT:    [[CMP6:%.*]] = fcmp ogt float [[TMP31]], 0.000000e+00
-; EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], float [[TMP31]], float [[S_021]]
-; EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-;
-; NO-EVL-LABEL: @csa_in_series_float_select(
-; NO-EVL-NEXT:  entry:
-; NO-EVL-NEXT:    [[CMP19:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
-; NO-EVL:       for.body.preheader:
-; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
-; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; NO-EVL:       vector.ph:
-; NO-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; NO-EVL:       vector.body:
-; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 1 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; NO-EVL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[TMP3]]
-; NO-EVL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 0
-; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x float>, ptr [[TMP5]], align 4
-; NO-EVL-NEXT:    [[TMP6:%.*]] = fcmp ogt <vscale x 1 x float> [[WIDE_LOAD]], zeroinitializer
-; NO-EVL-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP6]])
-; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP7]], <vscale x 1 x i1> [[TMP6]], <vscale x 1 x i1> [[CSA_MASK_PHI1]]
-; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP7]], <vscale x 1 x float> [[WIDE_LOAD]], <vscale x 1 x float> [[CSA_DATA_PHI2]]
-; NO-EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[TMP3]]
-; NO-EVL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0
-; NO-EVL-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 1 x float>, ptr [[TMP9]], align 4
-; NO-EVL-NEXT:    [[TMP10:%.*]] = fcmp ogt <vscale x 1 x float> [[WIDE_LOAD3]], zeroinitializer
-; NO-EVL-NEXT:    [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP10]])
-; NO-EVL-NEXT:    [[CSA_MASK_SEL4]] = select i1 [[TMP11]], <vscale x 1 x i1> [[TMP10]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
-; NO-EVL-NEXT:    [[CSA_DATA_SEL5]] = select i1 [[TMP11]], <vscale x 1 x float> [[WIDE_LOAD3]], <vscale x 1 x float> [[CSA_DATA_PHI]]
-; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
-; NO-EVL-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; NO-EVL:       middle.block:
-; NO-EVL-NEXT:    [[CSA_STEP6:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; NO-EVL-NEXT:    [[TMP13:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL4]], <vscale x 1 x i32> [[CSA_STEP6]], <vscale x 1 x i32> zeroinitializer
-; NO-EVL-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP13]])
-; NO-EVL-NEXT:    [[TMP15:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL4]], i64 0
-; NO-EVL-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[TMP14]], 0
-; NO-EVL-NEXT:    [[TMP17:%.*]] = and i1 [[TMP15]], [[TMP16]]
-; NO-EVL-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i32 0, i32 -1
-; NO-EVL-NEXT:    [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 1 x float> [[CSA_DATA_SEL5]], i32 [[TMP18]]
-; NO-EVL-NEXT:    [[TMP19:%.*]] = icmp sge i32 [[TMP18]], 0
-; NO-EVL-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], float [[CSA_EXTRACT7]], float 1.000000e+00
-; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; NO-EVL-NEXT:    [[TMP21:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
-; NO-EVL-NEXT:    [[TMP22:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP21]])
-; NO-EVL-NEXT:    [[TMP23:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
-; NO-EVL-NEXT:    [[TMP24:%.*]] = icmp eq i32 [[TMP22]], 0
-; NO-EVL-NEXT:    [[TMP25:%.*]] = and i1 [[TMP23]], [[TMP24]]
-; NO-EVL-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i32 0, i32 -1
-; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x float> [[CSA_DATA_SEL]], i32 [[TMP26]]
-; NO-EVL-NEXT:    [[TMP27:%.*]] = icmp sge i32 [[TMP26]], 0
-; NO-EVL-NEXT:    [[TMP28:%.*]] = select i1 [[TMP27]], float [[CSA_EXTRACT]], float 1.000000e+00
-; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; NO-EVL:       scalar.ph:
-; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       exit.loopexit:
-; NO-EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT:    [[TMP29:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
-; NO-EVL-NEXT:    br label [[EXIT]]
-; NO-EVL:       exit:
-; NO-EVL-NEXT:    [[ADD:%.*]] = phi float [ [[TMP29]], [[EXIT_LOOPEXIT]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
-; NO-EVL-NEXT:    ret float [[ADD]]
-; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[S_021:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[T_020:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0]], i64 [[IV]]
-; NO-EVL-NEXT:    [[TMP30:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; NO-EVL-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP30]], 0.000000e+00
-; NO-EVL-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP30]], float [[T_020]]
-; NO-EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i64 [[IV]]
-; NO-EVL-NEXT:    [[TMP31:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
-; NO-EVL-NEXT:    [[CMP6:%.*]] = fcmp ogt float [[TMP31]], 0.000000e+00
-; NO-EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], float [[TMP31]], float [[S_021]]
-; NO-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-;
-entry:
-  %cmp19 = icmp sgt i32 %N, 0
-  br i1 %cmp19, label %for.body.preheader, label %exit
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %N to i64
-  br label %for.body
-
-exit.loopexit:                        ; preds = %for.body
-  %0 = fadd float %t.1, %s.1
-  br label %exit
-
-exit:                                 ; preds = %exit.loopexit, %entry
-  %add = phi float [ %0, %exit.loopexit ], [ 2.000000e+00, %entry ]
-  ret float %add
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
-  %s.021 = phi float [ 1.000000e+00, %for.body.preheader ], [ %s.1, %for.body ]
-  %t.020 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.body ]
-  %arrayidx = getelementptr inbounds float, ptr %data0, i64 %iv
-  %1 = load float, ptr %arrayidx, align 4
-  %cmp1 = fcmp ogt float %1, 0.000000e+00
-  %t.1 = select i1 %cmp1, float %1, float %t.020
-  %arrayidx5 = getelementptr inbounds float, ptr %data1, i64 %iv
-  %2 = load float, ptr %arrayidx5, align 4
-  %cmp6 = fcmp ogt float %2, 0.000000e+00
-  %s.1 = select i1 %cmp6, float %2, float %s.021
-  %iv.next = add nuw nsw i64 %iv, 1
-  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %exit.loopexit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; int csa_in_series_int(int N, bool *cond0, bool *cond1, int *data0, int *data1) {
-;   int t = -1;
-;   int s = -1;
-;   for (int i = 0; i < N; i++) {
-;     if (cond0[i])
-;       t = data0[i];
-;     if (cond1[i])
-;       s = data1[i];
-;   }
-;   return t | s; // use t and s
-; }
-define i32 @csa_in_series_int(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
-; EVL-LABEL: @csa_in_series_int(
-; EVL-NOT: vector.body:
-;
-; NO-EVL-LABEL: @csa_in_series_int(
-; NO-EVL-NOT: vector.body:
-;
-entry:
-  %cmp15 = icmp sgt i32 %N, 0
-  br i1 %cmp15, label %for.body.preheader, label %exit
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %N to i64
-  br label %for.body
-
-exit.loopexit:                        ; preds = %for.inc
-  %0 = or i32 %s.1, %t.1
-  br label %exit
-
-exit:                                 ; preds = %exit.loopexit, %entry
-  %or = phi i32 [ %0, %exit.loopexit ], [ -1, %entry ]
-  ret i32 %or
-
-for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
-  %s.017 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.inc ]
-  %t.016 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
-  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %iv
-  %1 = load i8, ptr %arrayidx, align 1
-  %tobool.not = icmp eq i8 %1, 0
-  br i1 %tobool.not, label %if.end, label %if.then
-
-if.then:                                          ; preds = %for.body
-  %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %iv
-  %2 = load i32, ptr %arrayidx2, align 4
-  br label %if.end
-
-if.end:                                           ; preds = %if.then, %for.body
-  %t.1 = phi i32 [ %2, %if.then ], [ %t.016, %for.body ]
-  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %iv
-  %3 = load i8, ptr %arrayidx4, align 1
-  %tobool5.not = icmp eq i8 %3, 0
-  br i1 %tobool5.not, label %for.inc, label %if.then6
-
-if.then6:                                         ; preds = %if.end
-  %arrayidx8 = getelementptr inbounds i32, ptr %data1, i64 %iv
-  %4 = load i32, ptr %arrayidx8, align 4
-  br label %for.inc
-
-for.inc:                                          ; preds = %if.end, %if.then6
-  %s.1 = phi i32 [ %4, %if.then6 ], [ %s.017, %if.end ]
-  %iv.next = add nuw nsw i64 %iv, 1
-  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %exit.loopexit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; float csa_in_series_float(int N, bool *cond0, bool *cond1, float *data0,
-;                           float *data1) {
-;   float t = 1.0f;
-;   float s = 1.0f;
-;   for (int i = 0; i < N; i++) {
-;     if (cond0[i])
-;       t = data0[i];
-;     if (cond1[i])
-;       s = data1[i];
-;   }
-;   return t + s; // use t and s
-; }
-define float @csa_in_series_float(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
-; EVL-LABEL: @csa_in_series_float(
-; EVL-NOT: vector.body:
-;
-; NO-EVL-LABEL: @csa_in_series_float(
-; NO-EVL-NOT: vector.body:
-;
-entry:
-  %cmp15 = icmp sgt i32 %N, 0
-  br i1 %cmp15, label %for.body.preheader, label %exit
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %N to i64
-  br label %for.body
-
-exit.loopexit:                        ; preds = %for.inc
-  %0 = fadd float %t.1, %s.1
-  br label %exit
-
-exit:                                 ; preds = %exit.loopexit, %entry
-  %add = phi float [ %0, %exit.loopexit ], [ 2.000000e+00, %entry ]
-  ret float %add
-
-for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
-  %s.017 = phi float [ 1.000000e+00, %for.body.preheader ], [ %s.1, %for.inc ]
-  %t.016 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
-  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %iv
-  %1 = load i8, ptr %arrayidx, align 1
-  %tobool.not = icmp eq i8 %1, 0
-  br i1 %tobool.not, label %if.end, label %if.then
-
-if.then:                                          ; preds = %for.body
-  %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %iv
-  %2 = load float, ptr %arrayidx2, align 4
-  br label %if.end
-
-if.end:                                           ; preds = %if.then, %for.body
-  %t.1 = phi float [ %2, %if.then ], [ %t.016, %for.body ]
-  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %iv
-  %3 = load i8, ptr %arrayidx4, align 1
-  %tobool5.not = icmp eq i8 %3, 0
-  br i1 %tobool5.not, label %for.inc, label %if.then6
-
-if.then6:                                         ; preds = %if.end
-  %arrayidx8 = getelementptr inbounds float, ptr %data1, i64 %iv
-  %4 = load float, ptr %arrayidx8, align 4
-  br label %for.inc
-
-for.inc:                                          ; preds = %if.end, %if.then6
-  %s.1 = phi float [ %4, %if.then6 ], [ %s.017, %if.end ]
-  %iv.next = add nuw nsw i64 %iv, 1
-  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %exit.loopexit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; int csa_in_series_same_scalar_int_select(int N, int *data0,
-;                                   int *data1) {
-;   int t = -1;
-;   for (int i = 0; i < N; i++) {
-;     if (i < data0[i])
-;       t = data0[i];
-;     if (i < data1[i])
-;       t = data1[i];
-;   }
-;   return t; // use t
-; }
-define i32 @csa_in_series_same_scalar_int_select(i32 %N, ptr %data0, ptr %data1) {
-; EVL-LABEL: @csa_in_series_same_scalar_int_select(
-; EVL-NOT: vector.body:
-;
-; NO-EVL-LABEL: @csa_in_series_same_scalar_int_select(
-; NO-EVL-NOT: vector.body:
-;
-entry:
-  %cmp21 = icmp sgt i32 %N, 0
-  br i1 %cmp21, label %for.body.preheader, label %exit
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %N to i64
-  br label %for.body
-
-exit:                                 ; preds = %for.body, %entry
-  %t.0.lcssa = phi i32 [ -1, %entry ], [ %t.2, %for.body ]
-  ret i32 %t.0.lcssa
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
-  %t.022 = phi i32 [ -1, %for.body.preheader ], [ %t.2, %for.body ]
-  %arrayidx = getelementptr inbounds i32, ptr %data0, i64 %iv
-  %0 = load i32, ptr %arrayidx, align 4
-  %1 = sext i32 %0 to i64
-  %cmp1 = icmp slt i64 %iv, %1
-  %spec.select = select i1 %cmp1, i32 %0, i32 %t.022
-  %arrayidx5 = getelementptr inbounds i32, ptr %data1, i64 %iv
-  %2 = load i32, ptr %arrayidx5, align 4
-  %3 = sext i32 %2 to i64
-  %cmp6 = icmp slt i64 %iv, %3
-  %t.2 = select i1 %cmp6, i32 %2, i32 %spec.select
-  %iv.next = add nuw nsw i64 %iv, 1
-  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %exit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; float csa_in_series_same_scalar_float_select(int N,
-;                                       float *data0, float *data1) {
-;   float t = 1.0f;
-;   for (int i = 0; i < N; i++) {
-;     if (0.0f < data0[i])
-;       t = data0[i];
-;     if (0.0f < data1[i])
-;       t = data1[i];
-;   }
-;   return t; // use t
-; }
-define float @csa_in_series_same_scalar_float_select(i32 %N, ptr %data0, ptr %data1) {
-; EVL-LABEL: @csa_in_series_same_scalar_float_select(
-; EVL-NOT: vector.body:
-;
-; NO-EVL-LABEL: @csa_in_series_same_scalar_float_select(
-; NO-EVL-NOT: vector.body:
-;
-entry:
-  %cmp19 = icmp sgt i32 %N, 0
-  br i1 %cmp19, label %for.body.preheader, label %exit
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %N to i64
-  br label %for.body
-
-exit:                                 ; preds = %for.body, %entry
-  %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.2, %for.body ]
-  ret float %t.0.lcssa
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
-  %t.020 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.2, %for.body ]
-  %arrayidx = getelementptr inbounds float, ptr %data0, i64 %iv
-  %0 = load float, ptr %arrayidx, align 4
-  %cmp1 = fcmp ogt float %0, 0.000000e+00
-  %t.1 = select i1 %cmp1, float %0, float %t.020
-  %arrayidx5 = getelementptr inbounds float, ptr %data1, i64 %iv
-  %1 = load float, ptr %arrayidx5, align 4
-  %cmp6 = fcmp ogt float %1, 0.000000e+00
-  %t.2 = select i1 %cmp6, float %1, float %t.1
-  %iv.next = add nuw nsw i64 %iv, 1
-  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %exit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; int csa_in_series_same_scalar_int(int N, bool *cond0, bool *cond1, int *data0,
-;                                   int *data1) {
-;   int t = -1;
-;   for (int i = 0; i < N; i++) {
-;     if (cond0[i])
-;       t = data0[i];
-;     if (cond1[i])
-;       t = data1[i];
-;   }
-;   return t; // use t
-; }
-define i32 @csa_in_series_same_scalar_int(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
-; EVL-LABEL: @csa_in_series_same_scalar_int(
-; EVL-NOT: vector.body:
-;
-; NO-EVL-LABEL: @csa_in_series_same_scalar_int(
-; NO-EVL-NOT: vector.body:
-;
-entry:
-  %cmp15 = icmp sgt i32 %N, 0
-  br i1 %cmp15, label %for.body.preheader, label %exit
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %N to i64
-  br label %for.body
-
-exit:                                 ; preds = %for.inc, %entry
-  %t.0.lcssa = phi i32 [ -1, %entry ], [ %t.2, %for.inc ]
-  ret i32 %t.0.lcssa
-
-for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
-  %t.016 = phi i32 [ -1, %for.body.preheader ], [ %t.2, %for.inc ]
-  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %iv
-  %0 = load i8, ptr %arrayidx, align 1
-  %tobool.not = icmp eq i8 %0, 0
-  br i1 %tobool.not, label %if.end, label %if.then
-
-if.then:                                          ; preds = %for.body
-  %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %iv
-  %1 = load i32, ptr %arrayidx2, align 4
-  br label %if.end
-
-if.end:                                           ; preds = %if.then, %for.body
-  %t.1 = phi i32 [ %1, %if.then ], [ %t.016, %for.body ]
-  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %iv
-  %2 = load i8, ptr %arrayidx4, align 1
-  %tobool5.not = icmp eq i8 %2, 0
-  br i1 %tobool5.not, label %for.inc, label %if.then6
-
-if.then6:                                         ; preds = %if.end
-  %arrayidx8 = getelementptr inbounds i32, ptr %data1, i64 %iv
-  %3 = load i32, ptr %arrayidx8, align 4
-  br label %for.inc
-
-for.inc:                                          ; preds = %if.end, %if.then6
-  %t.2 = phi i32 [ %3, %if.then6 ], [ %t.1, %if.end ]
-  %iv.next = add nuw nsw i64 %iv, 1
-  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %exit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; float csa_in_series_same_scalar_float(int N, bool *cond0, bool *cond1,
-;                                       float *data0, float *data1) {
-;   float t = 1.0f;
-;   for (int i = 0; i < N; i++) {
-;     if (cond0[i])
-;       t = data0[i];
-;     if (cond1[i])
-;       t = data1[i];
-;   }
-;   return t; // use t
-; }
-define float @csa_in_series_same_scalar_float(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
-; EVL-LABEL: @csa_in_series_same_scalar_float(
-; EVL-NOT: vector.body:
-;
-; NO-EVL-LABEL: @csa_in_series_same_scalar_float(
-; NO-EVL-NOT: vector.body:
-;
-entry:
-  %cmp15 = icmp sgt i32 %N, 0
-  br i1 %cmp15, label %for.body.preheader, label %exit
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %N to i64
-  br label %for.body
-
-exit:                                 ; preds = %for.inc, %entry
-  %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.2, %for.inc ]
-  ret float %t.0.lcssa
-
-for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
-  %t.016 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.2, %for.inc ]
-  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %iv
-  %0 = load i8, ptr %arrayidx, align 1
-  %tobool.not = icmp eq i8 %0, 0
-  br i1 %tobool.not, label %if.end, label %if.then
-
-if.then:                                          ; preds = %for.body
-  %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %iv
-  %1 = load float, ptr %arrayidx2, align 4
-  br label %if.end
-
-if.end:                                           ; preds = %if.then, %for.body
-  %t.1 = phi float [ %1, %if.then ], [ %t.016, %for.body ]
-  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %iv
-  %2 = load i8, ptr %arrayidx4, align 1
-  %tobool5.not = icmp eq i8 %2, 0
-  br i1 %tobool5.not, label %for.inc, label %if.then6
-
-if.then6:                                         ; preds = %if.end
-  %arrayidx8 = getelementptr inbounds float, ptr %data1, i64 %iv
-  %3 = load float, ptr %arrayidx8, align 4
-  br label %for.inc
-
-for.inc:                                          ; preds = %if.end, %if.then6
-  %t.2 = phi float [ %3, %if.then6 ], [ %t.1, %if.end ]
-  %iv.next = add nuw nsw i64 %iv, 1
-  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %exit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; int csa_same_cond_int(int N, bool *cond, int *data0, int *data1) {
-;   int t = -1;
-;   int s = -1;
-;   for (int i = 0; i < N; i++) {
-;     if (cond[i]) {
-;       t = data0[i];
-;       s = data1[i];
-;     }
-;   }
-;   return t | s; // use t and s
-; }
-define i32 @csa_same_cond_int(i32 %N, ptr %cond, ptr %data0, ptr %data1) {
-; EVL-LABEL: @csa_same_cond_int(
-; EVL-NOT: vector.body:
-;
-; NO-EVL-LABEL: @csa_same_cond_int(
-; NO-EVL-NOT: vector.body:
-;
-entry:
-  %cmp9 = icmp sgt i32 %N, 0
-  br i1 %cmp9, label %for.body.preheader, label %exit
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %N to i64
-  br label %for.body
-
-exit.loopexit:                        ; preds = %for.inc
-  %0 = or i32 %s.1, %t.1
-  br label %exit
-
-exit:                                 ; preds = %exit.loopexit, %entry
-  %or = phi i32 [ %0, %exit.loopexit ], [ -1, %entry ]
-  ret i32 %or
-
-for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
-  %s.011 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.inc ]
-  %t.010 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
-  %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %iv
-  %1 = load i8, ptr %arrayidx, align 1
-  %tobool.not = icmp eq i8 %1, 0
-  br i1 %tobool.not, label %for.inc, label %if.then
-
-if.then:                                          ; preds = %for.body
-  %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %iv
-  %2 = load i32, ptr %arrayidx2, align 4
-  %arrayidx4 = getelementptr inbounds i32, ptr %data1, i64 %iv
-  %3 = load i32, ptr %arrayidx4, align 4
-  br label %for.inc
-
-for.inc:                                          ; preds = %for.body, %if.then
-  %t.1 = phi i32 [ %2, %if.then ], [ %t.010, %for.body ]
-  %s.1 = phi i32 [ %3, %if.then ], [ %s.011, %for.body ]
-  %iv.next = add nuw nsw i64 %iv, 1
-  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %exit.loopexit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; float csa_same_cond_float(int N, bool *cond, float *data0, float *data1) {
-;   float t = 1.0f;
-;   float s = 1.0f;
-;   for (int i = 0; i < N; i++) {
-;     if (cond[i]) {
-;       t = data0[i];
-;       s = data1[i];
-;     }
-;   }
-;   return t + s; // use t and s
-; }
-define float @csa_same_cond_float(i32 %N, ptr %cond, ptr %data0, ptr %data1) {
-; EVL-LABEL: @csa_same_cond_float(
-; EVL-NOT: vector.body:
-;
-; NO-EVL-LABEL: @csa_same_cond_float(
-; NO-EVL-NOT: vector.body:
-;
-entry:
-  %cmp9 = icmp sgt i32 %N, 0
-  br i1 %cmp9, label %for.body.preheader, label %exit
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %N to i64
-  br label %for.body
-
-exit.loopexit:                        ; preds = %for.inc
-  %0 = fadd float %t.1, %s.1
-  br label %exit
-
-exit:                                 ; preds = %exit.loopexit, %entry
-  %add = phi float [ %0, %exit.loopexit ], [ 2.000000e+00, %entry ]
-  ret float %add
-
-for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
-  %s.011 = phi float [ 1.000000e+00, %for.body.preheader ], [ %s.1, %for.inc ]
-  %t.010 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
-  %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %iv
-  %1 = load i8, ptr %arrayidx, align 1
-  %tobool.not = icmp eq i8 %1, 0
-  br i1 %tobool.not, label %for.inc, label %if.then
-
-if.then:                                          ; preds = %for.body
-  %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %iv
-  %2 = load float, ptr %arrayidx2, align 4
-  %arrayidx4 = getelementptr inbounds float, ptr %data1, i64 %iv
-  %3 = load float, ptr %arrayidx4, align 4
-  br label %for.inc
-
-for.inc:                                          ; preds = %for.body, %if.then
-  %t.1 = phi float [ %2, %if.then ], [ %t.010, %for.body ]
-  %s.1 = phi float [ %3, %if.then ], [ %s.011, %for.body ]
-  %iv.next = add nuw nsw i64 %iv, 1
-  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %exit.loopexit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; int csa_else_if_same_scalar_int(int N, bool *cond0, bool *cond1, int *data0,
-;                                 int *data1) {
-;   int t = -1;
-;   for (int i = 0; i < N; i++) {
-;     if (cond0[i])
-;       t = data0[i];
-;     else if (cond1[i])
-;       t = data1[i];
-;   }
-;   return t; // use t
-; }
-define i32 @csa_else_if_same_scalar_int(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
-; EVL-LABEL: @csa_else_if_same_scalar_int(
-; EVL-NOT: vector.body:
-;
-; NO-EVL-LABEL: @csa_else_if_same_scalar_int(
-; NO-EVL-NOT: vector.body:
-;
-entry:
-  %cmp15 = icmp sgt i32 %N, 0
-  br i1 %cmp15, label %for.body.preheader, label %exit
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %N to i64
-  br label %for.body
-
-exit:                                 ; preds = %for.inc, %entry
-  %t.0.lcssa = phi i32 [ -1, %entry ], [ %t.1, %for.inc ]
-  ret i32 %t.0.lcssa
-
-for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
-  %t.016 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
-  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %iv
-  %0 = load i8, ptr %arrayidx, align 1
-  %tobool.not = icmp eq i8 %0, 0
-  br i1 %tobool.not, label %if.else, label %for.inc.sink.split
-
-if.else:                                          ; preds = %for.body
-  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %iv
-  %1 = load i8, ptr %arrayidx4, align 1
-  %tobool5.not = icmp eq i8 %1, 0
-  br i1 %tobool5.not, label %for.inc, label %for.inc.sink.split
-
-for.inc.sink.split:                               ; preds = %if.else, %for.body
-  %data0.sink = phi ptr [ %data0, %for.body ], [ %data1, %if.else ]
-  %arrayidx2 = getelementptr inbounds i32, ptr %data0.sink, i64 %iv
-  %2 = load i32, ptr %arrayidx2, align 4
-  br label %for.inc
-
-for.inc:                                          ; preds = %for.inc.sink.split, %if.else
-  %t.1 = phi i32 [ %t.016, %if.else ], [ %2, %for.inc.sink.split ]
-  %iv.next = add nuw nsw i64 %iv, 1
-  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %exit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; float csa_else_if_same_scalar_float(int N, bool *cond0, bool *cond1,
-;                                     float *data0, float *data1) {
-;   float t = 1.0f;
-;   for (int i = 0; i < N; i++) {
-;     if (cond0[i])
-;       t = data0[i];
-;     else if (cond1[i])
-;       t = data1[i];
-;   }
-;   return t; // use t
-; }
-define float @csa_else_if_same_scalar_float(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
-; EVL-LABEL: @csa_else_if_same_scalar_float(
-; EVL-NOT: vector.body:
-;
-; NO-EVL-LABEL: @csa_else_if_same_scalar_float(
-; NO-EVL-NOT: vector.body:
-;
-entry:
-  %cmp15 = icmp sgt i32 %N, 0
-  br i1 %cmp15, label %for.body.preheader, label %exit
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %N to i64
-  br label %for.body
-
-exit:                                 ; preds = %for.inc, %entry
-  %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.1, %for.inc ]
-  ret float %t.0.lcssa
-
-for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
-  %t.016 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
-  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %iv
-  %0 = load i8, ptr %arrayidx, align 1
-  %tobool.not = icmp eq i8 %0, 0
-  br i1 %tobool.not, label %if.else, label %for.inc.sink.split
-
-if.else:                                          ; preds = %for.body
-  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %iv
-  %1 = load i8, ptr %arrayidx4, align 1
-  %tobool5.not = icmp eq i8 %1, 0
-  br i1 %tobool5.not, label %for.inc, label %for.inc.sink.split
-
-for.inc.sink.split:                               ; preds = %if.else, %for.body
-  %data0.sink = phi ptr [ %data0, %for.body ], [ %data1, %if.else ]
-  %arrayidx2 = getelementptr inbounds float, ptr %data0.sink, i64 %iv
-  %2 = load float, ptr %arrayidx2, align 4
-  br label %for.inc
-
-for.inc:                                          ; preds = %for.inc.sink.split, %if.else
-  %t.1 = phi float [ %t.016, %if.else ], [ %2, %for.inc.sink.split ]
-  %iv.next = add nuw nsw i64 %iv, 1
-  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %exit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; int csa_else_if_int(int N, bool *cond0, bool *cond1, int *data0, int *data1) {
-;   int t = -1;
-;   int s = -1;
-;   for (int i = 0; i < N; i++) {
-;     if (cond0[i])
-;       t = data0[i];
-;     else if (cond1[i])
-;       s = data1[i];
-;   }
-;   return t | s; // use t and s
-; }
-define i32 @csa_else_if_int(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
-; EVL-LABEL: @csa_else_if_int(
-; EVL-NOT: vector.body:
-;
-; NO-EVL-LABEL: @csa_else_if_int(
-; NO-EVL-NOT: vector.body:
-;
-entry:
-  %cmp15 = icmp sgt i32 %N, 0
-  br i1 %cmp15, label %for.body.preheader, label %exit
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %N to i64
-  br label %for.body
-
-exit.loopexit:                        ; preds = %for.inc
-  %0 = or i32 %s.1, %t.1
-  br label %exit
-
-exit:                                 ; preds = %exit.loopexit, %entry
-  %or = phi i32 [ %0, %exit.loopexit ], [ -1, %entry ]
-  ret i32 %or
-
-for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
-  %s.017 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.inc ]
-  %t.016 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
-  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %iv
-  %1 = load i8, ptr %arrayidx, align 1
-  %tobool.not = icmp eq i8 %1, 0
-  br i1 %tobool.not, label %if.else, label %if.then
-
-if.then:                                          ; preds = %for.body
-  %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %iv
-  %2 = load i32, ptr %arrayidx2, align 4
-  br label %for.inc
-
-if.else:                                          ; preds = %for.body
-  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %iv
-  %3 = load i8, ptr %arrayidx4, align 1
-  %tobool5.not = icmp eq i8 %3, 0
-  br i1 %tobool5.not, label %for.inc, label %if.then6
-
-if.then6:                                         ; preds = %if.else
-  %arrayidx8 = getelementptr inbounds i32, ptr %data1, i64 %iv
-  %4 = load i32, ptr %arrayidx8, align 4
-  br label %for.inc
-
-for.inc:                                          ; preds = %if.then, %if.then6, %if.else
-  %t.1 = phi i32 [ %2, %if.then ], [ %t.016, %if.then6 ], [ %t.016, %if.else ]
-  %s.1 = phi i32 [ %s.017, %if.then ], [ %4, %if.then6 ], [ %s.017, %if.else ]
-  %iv.next = add nuw nsw i64 %iv, 1
-  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %exit.loopexit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; float csa_else_if_float(int N, bool *cond0, bool *cond1, float *data0,
-;                         float *data1) {
-;   float t = 1.0f;
-;   float s = 1.0f;
-;   for (int i = 0; i < N; i++) {
-;     if (cond0[i])
-;       t = data0[i];
-;     else if (cond1[i])
-;       s = data1[i];
-;   }
-;   return t + s; // use t and s
-; }
-define float @csa_else_if_float(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
-; EVL-LABEL: @csa_else_if_float(
-; EVL-NOT: vector.body:
-;
-; NO-EVL-LABEL: @csa_else_if_float(
-; EVL-NOT: vector.body:
-;
-entry:
-  %cmp15 = icmp sgt i32 %N, 0
-  br i1 %cmp15, label %for.body.preheader, label %exit
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %N to i64
-  br label %for.body
-
-exit.loopexit:                        ; preds = %for.inc
-  %0 = fadd float %t.1, %s.1
-  br label %exit
-
-exit:                                 ; preds = %exit.loopexit, %entry
-  %add = phi float [ %0, %exit.loopexit ], [ 2.000000e+00, %entry ]
-  ret float %add
-
-for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
-  %s.017 = phi float [ 1.000000e+00, %for.body.preheader ], [ %s.1, %for.inc ]
-  %t.016 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
-  %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %iv
-  %1 = load i8, ptr %arrayidx, align 1
-  %tobool.not = icmp eq i8 %1, 0
-  br i1 %tobool.not, label %if.else, label %if.then
-
-if.then:                                          ; preds = %for.body
-  %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %iv
-  %2 = load float, ptr %arrayidx2, align 4
-  br label %for.inc
-
-if.else:                                          ; preds = %for.body
-  %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %iv
-  %3 = load i8, ptr %arrayidx4, align 1
-  %tobool5.not = icmp eq i8 %3, 0
-  br i1 %tobool5.not, label %for.inc, label %if.then6
-
-if.then6:                                         ; preds = %if.else
-  %arrayidx8 = getelementptr inbounds float, ptr %data1, i64 %iv
-  %4 = load float, ptr %arrayidx8, align 4
-  br label %for.inc
-
-for.inc:                                          ; preds = %if.then, %if.then6, %if.else
-  %t.1 = phi float [ %2, %if.then ], [ %t.016, %if.then6 ], [ %t.016, %if.else ]
-  %s.1 = phi float [ %s.017, %if.then ], [ %4, %if.then6 ], [ %s.017, %if.else ]
-  %iv.next = add nuw nsw i64 %iv, 1
-  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %exit.loopexit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; uint64_t idx_scalar(int64_t *a, int64_t *b, uint64_t ii, uint64_t n) {
-;   uint64_t idx = ii;
-;   for (uint64_t i = 0; i < n; ++i)
-;     idx = (a[i] > b[i]) ? i : idx;
-;   return idx;
-; }
-define i64 @idx_scalar(ptr %a, ptr %b, i64 %ii, i64 %n) {
-; EVL-LABEL: @idx_scalar(
-; EVL-NOT: vector.body:
-;
-; NO-EVL-LABEL: @idx_scalar(
-; NO-EVL-NOT: vector.body:
-;
-entry:
-  %cmp8.not = icmp eq i64 %n, 0
-  br i1 %cmp8.not, label %exit, label %for.body.preheader
-
-for.body.preheader:                               ; preds = %entry
-  br label %for.body
-
-exit:                                 ; preds = %for.body, %entry
-  %idx.0.lcssa = phi i64 [ %ii, %entry ], [ %cond, %for.body ]
-  ret i64 %idx.0.lcssa
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %i.010 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader ]
-  %idx.09 = phi i64 [ %cond, %for.body ], [ %ii, %for.body.preheader ]
-  %arrayidx = getelementptr inbounds i64, ptr %a, i64 %i.010
-  %0 = load i64, ptr %arrayidx, align 8
-  %arrayidx1 = getelementptr inbounds i64, ptr %b, i64 %i.010
-  %1 = load i64, ptr %arrayidx1, align 8
-  %cmp2 = icmp sgt i64 %0, %1
-  %cond = select i1 %cmp2, i64 %i.010, i64 %idx.09
-  %inc = add nuw i64 %i.010, 1
-  %exitcond.not = icmp eq i64 %inc, %n
-  br i1 %exitcond.not, label %exit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; uint64_t idx_scalar_dec(int64_t *a, int64_t *b, uint64_t ii, uint64_t n) {
-;   uint64_t idx = ii;
-;   for (uint64_t i = n; i > 0; --i) // decreasing
-;      idx = (a[i - 1] > b[i - 1]) ? i : idx;
-;   return idx;
-; }
-define dso_local i64 @idx_scalar_dec(ptr %a, ptr %b, i64 %ii, i64 %n) {
-; EVL-LABEL: @idx_scalar_dec(
-; EVL-NOT: vector.body:
-;
-; NO-EVL-LABEL: @idx_scalar_dec(
-; NO-EVL-NOT: vector.body:
-;
-entry:
-  %cmp.not9 = icmp eq i64 %n, 0
-  br i1 %cmp.not9, label %exit, label %for.body.preheader
-
-for.body.preheader:                               ; preds = %entry
-  br label %for.body
-
-exit:                                 ; preds = %for.body, %entry
-  %idx.0.lcssa = phi i64 [ %ii, %entry ], [ %cond, %for.body ]
-  ret i64 %idx.0.lcssa
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %i.011 = phi i64 [ %sub, %for.body ], [ %n, %for.body.preheader ]
-  %idx.010 = phi i64 [ %cond, %for.body ], [ %ii, %for.body.preheader ]
-  %sub = add i64 %i.011, -1
-  %arrayidx = getelementptr inbounds i64, ptr %a, i64 %sub
-  %0 = load i64, ptr %arrayidx, align 8
-  %arrayidx2 = getelementptr inbounds i64, ptr %b, i64 %sub
-  %1 = load i64, ptr %arrayidx2, align 8
-  %cmp3 = icmp sgt i64 %0, %1
-  %cond = select i1 %cmp3, i64 %i.011, i64 %idx.010
-  %cmp.not = icmp eq i64 %sub, 0
-  br i1 %cmp.not, label %exit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; The key part of this function is that the true arm of the select corresponds
-; to selecting the initial value, instead of selecting the new value.
-; int simple_csa_int_select_neg_cond(int N, int *data) {
-;   int t = 0;
-;   for (int i = 0; i < N; i++) {
-;     if (i != data[i])
-;       t = data[i];
-;   }
-;   return t; // use t
-; }
-define i32 @simple_csa_int_select_neg_cond(i32 %N, ptr %data) {
-; EVL-LABEL: @simple_csa_int_select_neg_cond(
-; EVL-NEXT:  entry:
-; EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
-; EVL:       for.body.preheader:
-; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
-; EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; EVL:       vector.ph:
-; EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
-; EVL-NEXT:    [[TMP4:%.*]] = add <vscale x 1 x i64> [[TMP3]], zeroinitializer
-; EVL-NEXT:    [[TMP5:%.*]] = mul <vscale x 1 x i64> [[TMP4]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
-; EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP5]]
-; EVL-NEXT:    [[TMP6:%.*]] = mul i64 1, [[TMP2]]
-; EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP6]], i64 0
-; EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; EVL:       vector.body:
-; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
-; EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP7]]
-; EVL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
-; EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP9]], align 4
-; EVL-NEXT:    [[TMP10:%.*]] = zext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
-; EVL-NEXT:    [[TMP11:%.*]] = icmp eq <vscale x 1 x i64> [[VEC_IND]], [[TMP10]]
-; EVL-NEXT:    [[TMP12:%.*]] = xor <vscale x 1 x i1> [[TMP11]], shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer)
-; EVL-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP12]])
-; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP13]], <vscale x 1 x i1> [[TMP12]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
-; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP13]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
-; EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
-; EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; EVL-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; EVL:       middle.block:
-; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; EVL-NEXT:    [[TMP15:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
-; EVL-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP15]])
-; EVL-NEXT:    [[TMP17:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
-; EVL-NEXT:    [[TMP18:%.*]] = icmp eq i32 [[TMP16]], 0
-; EVL-NEXT:    [[TMP19:%.*]] = and i1 [[TMP17]], [[TMP18]]
-; EVL-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 0, i32 -1
-; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP20]]
-; EVL-NEXT:    [[TMP21:%.*]] = icmp sge i32 [[TMP20]], 0
-; EVL-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[CSA_EXTRACT]], i32 0
-; EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; EVL:       scalar.ph:
-; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       exit.loopexit:
-; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT:    br label [[EXIT]]
-; EVL:       exit:
-; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT]] ]
-; EVL-NEXT:    ret i32 [[T_0_LCSSA]]
-; EVL:       for.body:
-; EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[T_010:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
-; EVL-NEXT:    [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT:    [[TMP24:%.*]] = zext i32 [[TMP23]] to i64
-; EVL-NEXT:    [[CMP1_NOT:%.*]] = icmp eq i64 [[IV]], [[TMP24]]
-; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP23]]
-; EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-;
-; NO-EVL-LABEL: @simple_csa_int_select_neg_cond(
-; NO-EVL-NEXT:  entry:
-; NO-EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
-; NO-EVL:       for.body.preheader:
-; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
-; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; NO-EVL:       vector.ph:
-; NO-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
-; NO-EVL-NEXT:    [[TMP4:%.*]] = add <vscale x 1 x i64> [[TMP3]], zeroinitializer
-; NO-EVL-NEXT:    [[TMP5:%.*]] = mul <vscale x 1 x i64> [[TMP4]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
-; NO-EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP5]]
-; NO-EVL-NEXT:    [[TMP6:%.*]] = mul i64 1, [[TMP2]]
-; NO-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP6]], i64 0
-; NO-EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; NO-EVL:       vector.body:
-; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
-; NO-EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP7]]
-; NO-EVL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
-; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP9]], align 4
-; NO-EVL-NEXT:    [[TMP10:%.*]] = zext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
-; NO-EVL-NEXT:    [[TMP11:%.*]] = icmp eq <vscale x 1 x i64> [[VEC_IND]], [[TMP10]]
-; NO-EVL-NEXT:    [[TMP12:%.*]] = xor <vscale x 1 x i1> [[TMP11]], shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer)
-; NO-EVL-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP12]])
-; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP13]], <vscale x 1 x i1> [[TMP12]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
-; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP13]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
-; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
-; NO-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; NO-EVL-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; NO-EVL:       middle.block:
-; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; NO-EVL-NEXT:    [[TMP15:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
-; NO-EVL-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP15]])
-; NO-EVL-NEXT:    [[TMP17:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
-; NO-EVL-NEXT:    [[TMP18:%.*]] = icmp eq i32 [[TMP16]], 0
-; NO-EVL-NEXT:    [[TMP19:%.*]] = and i1 [[TMP17]], [[TMP18]]
-; NO-EVL-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 0, i32 -1
-; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP20]]
-; NO-EVL-NEXT:    [[TMP21:%.*]] = icmp sge i32 [[TMP20]], 0
-; NO-EVL-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[CSA_EXTRACT]], i32 0
-; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; NO-EVL:       scalar.ph:
-; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       exit.loopexit:
-; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT:    br label [[EXIT]]
-; NO-EVL:       exit:
-; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT]] ]
-; NO-EVL-NEXT:    ret i32 [[T_0_LCSSA]]
-; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[T_010:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
-; NO-EVL-NEXT:    [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; NO-EVL-NEXT:    [[TMP24:%.*]] = zext i32 [[TMP23]] to i64
-; NO-EVL-NEXT:    [[CMP1_NOT:%.*]] = icmp eq i64 [[IV]], [[TMP24]]
-; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP23]]
-; NO-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-;
-entry:
-  %cmp9 = icmp sgt i32 %N, 0
-  br i1 %cmp9, label %for.body.preheader, label %exit
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %N to i64
-  br label %for.body
-
-exit:                                 ; preds = %for.body, %entry
-  %t.0.lcssa = phi i32 [ 0, %entry ], [ %spec.select, %for.body ]
-  ret i32 %t.0.lcssa
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
-  %t.010 = phi i32 [ 0, %for.body.preheader ], [ %spec.select, %for.body ]
-  %arrayidx = getelementptr inbounds i32, ptr %data, i64 %iv
-  %0 = load i32, ptr %arrayidx, align 4
-  %1 = zext i32 %0 to i64
-  %cmp1.not = icmp eq i64 %iv, %1
-  %spec.select = select i1 %cmp1.not, i32 %t.010, i32 %0
-  %iv.next = add nuw nsw i64 %iv, 1
-  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %exit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; int *simple_csa_ptr_select(int N, int **data) {
-;   int *t = nullptr;
-;   for (int i = 0; i < N; i++) {
-;     if (a < *data[i])
-;       t = data[i];
-;   }
-;   return t; // use t
-; }
-define ptr @simple_csa_ptr_select(i32 %N, ptr %data, i64 %a) {
-; EVL-LABEL: @simple_csa_ptr_select(
-; EVL-NEXT:  entry:
-; EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
-; EVL:       for.body.preheader:
-; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       exit.loopexit:
-; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT:    br label [[EXIT]]
-; EVL:       exit:
-; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT:%.*]] ]
-; EVL-NEXT:    ret ptr [[T_0_LCSSA]]
-; EVL:       for.body:
-; EVL-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[T_010:%.*]] = phi ptr [ null, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[IV]]
-; EVL-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
-; EVL-NEXT:    [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
-; EVL-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
-; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A:%.*]], [[TMP2]]
-; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP0]], ptr [[T_010]]
-; EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]]
-;
-; NO-EVL-LABEL: @simple_csa_ptr_select(
-; NO-EVL-NEXT:  entry:
-; NO-EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
-; NO-EVL:       for.body.preheader:
-; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       exit.loopexit:
-; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    br label [[EXIT]]
-; NO-EVL:       exit:
-; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT:%.*]] ]
-; NO-EVL-NEXT:    ret ptr [[T_0_LCSSA]]
-; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[T_010:%.*]] = phi ptr [ null, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[IV]]
-; NO-EVL-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
-; NO-EVL-NEXT:    [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
-; NO-EVL-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
-; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A:%.*]], [[TMP2]]
-; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP0]], ptr [[T_010]]
-; NO-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]]
-;
-entry:
-  %cmp9 = icmp sgt i32 %N, 0
-  br i1 %cmp9, label %for.body.preheader, label %exit
-
-for.body.preheader:
-  %wide.trip.count = zext i32 %N to i64
-  br label %for.body
-
-exit:
-  %t.0.lcssa = phi ptr [ null, %entry ], [ %spec.select, %for.body ]
-  ret ptr %t.0.lcssa
-
-for.body:
-  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
-  %t.010 = phi ptr [ null, %for.body.preheader ], [ %spec.select, %for.body ]
-  %arrayidx = getelementptr inbounds ptr, ptr %data, i64 %iv
-  %0 = load ptr, ptr %arrayidx, align 8
-  %1 = load i32, ptr %0, align 4
-  %2 = sext i32 %1 to i64
-  %cmp1 = icmp slt i64 %a, %2
-  %spec.select = select i1 %cmp1, ptr %0, ptr %t.010
-  %iv.next = add nuw nsw i64 %iv, 1
-  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %exit, label %for.body
-}

>From f4228d012fd1d176c00d73dbd432d3cac783ea39 Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Fri, 27 Sep 2024 10:36:30 -0700
Subject: [PATCH 20/23] fixup! fix tests

---
 llvm/lib/Transforms/Vectorize/VPlanUtils.cpp  |    1 -
 .../RISCV/conditional-scalar-assignment.ll    | 1413 +++++++++--------
 2 files changed, 749 insertions(+), 665 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index ad47879c4c20b1..4621c28b051298 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -115,4 +115,3 @@ bool vputils::isUniformAcrossVFsAndUFs(VPValue *V) {
         return false;
       });
 }
-
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/conditional-scalar-assignment.ll b/llvm/test/Transforms/LoopVectorize/RISCV/conditional-scalar-assignment.ll
index cd6a55fba46032..26e262c7c302f7 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/conditional-scalar-assignment.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/conditional-scalar-assignment.ll
@@ -1,8 +1,8 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -S -passes=loop-vectorize -mtriple=riscv64 -mattr=+v \
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -S -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-min=256 \
+; RUN:   -passes=loop-vectorize -force-tail-folding-style=data-with-evl \
 ; RUN:   -prefer-predicate-over-epilogue=predicate-dont-vectorize  \
-; RUN:   -force-tail-folding-style=data-with-evl -enable-csa-vectorization \
-; RUN:   | FileCheck %s -check-prefixes=CHECK,EVL
+; RUN:   -enable-csa-vectorization | FileCheck %s -check-prefixes=CHECK,EVL
 ; RUN: opt < %s -S -passes=loop-vectorize -mtriple=riscv64 -mattr=+v \
 ; RUN:   -force-tail-folding-style=none -enable-csa-vectorization \
 ; RUN:   | FileCheck %s -check-prefixes=CHECK,NO-EVL
@@ -17,35 +17,36 @@
 ;   return t; // use t
 ; }
 define i32 @simple_csa_int_select(i32 %N, ptr %data, i64 %a) {
-; EVL-LABEL: @simple_csa_int_select(
-; EVL-NEXT:  entry:
-; EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
-; EVL:       for.body.preheader:
+; EVL-LABEL: define i32 @simple_csa_int_select(
+; EVL-SAME: i32 [[N:%.*]], ptr [[DATA:%.*]], i64 [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+; EVL-NEXT:  [[ENTRY:.*]]:
+; EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N]], 0
+; EVL-NEXT:    br i1 [[CMP9]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT:.*]]
+; EVL:       [[LOOP_PREHEADER]]:
 ; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; EVL:       vector.ph:
-; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; EVL:       [[VECTOR_PH]]:
+; EVL-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP16]], 4
+; EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP17]], 1
 ; EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP2]]
-; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP17]]
 ; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
 ; EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
-; EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[A:%.*]], i64 0
+; EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[A]], i64 0
 ; EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; EVL:       vector.body:
-; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_VL_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[CSA_VL_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
+; EVL:       [[VECTOR_BODY]]:
+; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], %[[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], %[[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_VL_PHI:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[CSA_VL_SEL:%.*]], %[[VECTOR_BODY]] ]
 ; EVL-NEXT:    [[AVL:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[EVL_BASED_IV]]
 ; EVL-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
 ; EVL-NEXT:    [[TMP6:%.*]] = add i64 [[EVL_BASED_IV]], 0
-; EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP6]]
+; EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[TMP6]]
 ; EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
 ; EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP8]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP5]])
 ; EVL-NEXT:    [[TMP9:%.*]] = sext <vscale x 4 x i32> [[VP_OP_LOAD]] to <vscale x 4 x i64>
@@ -58,61 +59,62 @@ define i32 @simple_csa_int_select(i32 %N, ptr %data, i64 %a) {
 ; EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP11]], [[EVL_BASED_IV]]
 ; EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
 ; EVL-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; EVL:       middle.block:
+; EVL-NEXT:    br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; EVL:       [[MIDDLE_BLOCK]]:
 ; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
 ; EVL-NEXT:    [[TMP13:%.*]] = call i32 @llvm.vp.reduce.smax.nxv4i32(i32 -1, <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i1> [[CSA_MASK_SEL]], i32 [[CSA_VL_SEL]])
 ; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP13]]
 ; EVL-NEXT:    [[TMP14:%.*]] = icmp sge i32 [[TMP13]], 0
 ; EVL-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[CSA_EXTRACT]], i32 -1
-; EVL-NEXT:    br i1 true, label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; EVL:       scalar.ph:
-; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       exit.loopexit:
-; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT:    br label [[EXIT]]
-; EVL:       exit:
-; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT]] ]
+; EVL-NEXT:    br i1 true, label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; EVL:       [[SCALAR_PH]]:
+; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; EVL-NEXT:    br label %[[LOOP:.*]]
+; EVL:       [[EXIT_LOOPEXIT]]:
+; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], %[[LOOP]] ], [ [[TMP15]], %[[MIDDLE_BLOCK]] ]
+; EVL-NEXT:    br label %[[EXIT]]
+; EVL:       [[EXIT]]:
+; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ]
 ; EVL-NEXT:    ret i32 [[T_0_LCSSA]]
-; EVL:       for.body:
-; EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; EVL:       [[LOOP]]:
+; EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; EVL-NEXT:    [[T_010:%.*]] = phi i32 [ -1, %[[SCALAR_PH]] ], [ [[SPEC_SELECT]], %[[LOOP]] ]
 ; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
-; EVL-NEXT:    [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT:    [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
-; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP17]]
-; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP16]], i32 [[T_010]]
+; EVL-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT:    [[TMP1:%.*]] = sext i32 [[TMP0]] to i64
+; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP1]]
+; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP0]], i32 [[T_010]]
 ; EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ;
-; NO-EVL-LABEL: @simple_csa_int_select(
-; NO-EVL-NEXT:  entry:
-; NO-EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
-; NO-EVL:       for.body.preheader:
+; NO-EVL-LABEL: define i32 @simple_csa_int_select(
+; NO-EVL-SAME: i32 [[N:%.*]], ptr [[DATA:%.*]], i64 [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+; NO-EVL-NEXT:  [[ENTRY:.*]]:
+; NO-EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N]], 0
+; NO-EVL-NEXT:    br i1 [[CMP9]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT:.*]]
+; NO-EVL:       [[LOOP_PREHEADER]]:
 ; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
 ; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; NO-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
 ; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; NO-EVL:       vector.ph:
+; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; NO-EVL:       [[VECTOR_PH]]:
 ; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; NO-EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
 ; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
 ; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
 ; NO-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; NO-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; NO-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[A:%.*]], i64 0
+; NO-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[A]], i64 0
 ; NO-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; NO-EVL:       vector.body:
-; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
+; NO-EVL:       [[VECTOR_BODY]]:
+; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], %[[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], %[[VECTOR_BODY]] ]
 ; NO-EVL-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; NO-EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP6]]
+; NO-EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[TMP6]]
 ; NO-EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
 ; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
 ; NO-EVL-NEXT:    [[TMP9:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
@@ -122,8 +124,8 @@ define i32 @simple_csa_int_select(i32 %N, ptr %data, i64 %a) {
 ; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP11]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
 ; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; NO-EVL-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; NO-EVL:       middle.block:
+; NO-EVL-NEXT:    br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-EVL:       [[MIDDLE_BLOCK]]:
 ; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
 ; NO-EVL-NEXT:    [[TMP13:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
 ; NO-EVL-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP13]])
@@ -135,19 +137,19 @@ define i32 @simple_csa_int_select(i32 %N, ptr %data, i64 %a) {
 ; NO-EVL-NEXT:    [[TMP19:%.*]] = icmp sge i32 [[TMP18]], 0
 ; NO-EVL-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[CSA_EXTRACT]], i32 -1
 ; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; NO-EVL:       scalar.ph:
-; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       exit.loopexit:
-; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT:    br label [[EXIT]]
-; NO-EVL:       exit:
-; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT]] ]
+; NO-EVL-NEXT:    br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; NO-EVL:       [[SCALAR_PH]]:
+; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-EVL-NEXT:    br label %[[LOOP:.*]]
+; NO-EVL:       [[EXIT_LOOPEXIT]]:
+; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], %[[LOOP]] ], [ [[TMP20]], %[[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT:    br label %[[EXIT]]
+; NO-EVL:       [[EXIT]]:
+; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ]
 ; NO-EVL-NEXT:    ret i32 [[T_0_LCSSA]]
-; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL:       [[LOOP]]:
+; NO-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; NO-EVL-NEXT:    [[T_010:%.*]] = phi i32 [ -1, %[[SCALAR_PH]] ], [ [[SPEC_SELECT]], %[[LOOP]] ]
 ; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
 ; NO-EVL-NEXT:    [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; NO-EVL-NEXT:    [[TMP22:%.*]] = sext i32 [[TMP21]] to i64
@@ -155,23 +157,23 @@ define i32 @simple_csa_int_select(i32 %N, ptr %data, i64 %a) {
 ; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP21]], i32 [[T_010]]
 ; NO-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ;
 entry:
   %cmp9 = icmp sgt i32 %N, 0
-  br i1 %cmp9, label %for.body.preheader, label %exit
+  br i1 %cmp9, label %loop.preheader, label %exit
 
-for.body.preheader:                               ; preds = %entry
+loop.preheader:                               ; preds = %entry
   %wide.trip.count = zext i32 %N to i64
-  br label %for.body
+  br label %loop
 
-exit:                                 ; preds = %for.body, %entry
-  %t.0.lcssa = phi i32 [ -1, %entry ], [ %spec.select, %for.body ]
+exit:                                 ; preds = %loop, %entry
+  %t.0.lcssa = phi i32 [ -1, %entry ], [ %spec.select, %loop ]
   ret i32 %t.0.lcssa
 
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
-  %t.010 = phi i32 [ -1, %for.body.preheader ], [ %spec.select, %for.body ]
+loop:                                         ; preds = %loop.preheader, %loop
+  %iv = phi i64 [ 0, %loop.preheader ], [ %iv.next, %loop ]
+  %t.010 = phi i32 [ -1, %loop.preheader ], [ %spec.select, %loop ]
   %arrayidx = getelementptr inbounds i32, ptr %data, i64 %iv
   %0 = load i32, ptr %arrayidx, align 4
   %1 = sext i32 %0 to i64
@@ -179,7 +181,7 @@ for.body:                                         ; preds = %for.body.preheader,
   %spec.select = select i1 %cmp1, i32 %0, i32 %t.010
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %exit, label %for.body
+  br i1 %exitcond.not, label %exit, label %loop
 }
 
 ; This function is generated from the following C/C++ program:
@@ -195,17 +197,18 @@ define i32 @simple_csa_int_select_induction_cmp(i32 %N, ptr %data) {
 ; EVL-LABEL: @simple_csa_int_select_induction_cmp(
 ; EVL-NOT: vector.body:
 ;
-; NO-EVL-LABEL: @simple_csa_int_select_induction_cmp(
-; NO-EVL-NEXT:  entry:
-; NO-EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
-; NO-EVL:       for.body.preheader:
+; NO-EVL-LABEL: define i32 @simple_csa_int_select_induction_cmp(
+; NO-EVL-SAME: i32 [[N:%.*]], ptr [[DATA:%.*]]) #[[ATTR0]] {
+; NO-EVL-NEXT:  [[ENTRY:.*]]:
+; NO-EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N]], 0
+; NO-EVL-NEXT:    br i1 [[CMP9]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT:.*]]
+; NO-EVL:       [[LOOP_PREHEADER]]:
 ; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
 ; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; NO-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
 ; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; NO-EVL:       vector.ph:
+; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; NO-EVL:       [[VECTOR_PH]]:
 ; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; NO-EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
 ; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
@@ -219,14 +222,14 @@ define i32 @simple_csa_int_select_induction_cmp(i32 %N, ptr %data) {
 ; NO-EVL-NEXT:    [[TMP9:%.*]] = mul i64 1, [[TMP5]]
 ; NO-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP9]], i64 0
 ; NO-EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; NO-EVL:       vector.body:
-; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
+; NO-EVL:       [[VECTOR_BODY]]:
+; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], %[[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], %[[VECTOR_BODY]] ]
 ; NO-EVL-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 0
-; NO-EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP10]]
+; NO-EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[TMP10]]
 ; NO-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
 ; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP12]], align 4
 ; NO-EVL-NEXT:    [[TMP13:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
@@ -237,8 +240,8 @@ define i32 @simple_csa_int_select_induction_cmp(i32 %N, ptr %data) {
 ; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; NO-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; NO-EVL-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; NO-EVL:       middle.block:
+; NO-EVL-NEXT:    br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; NO-EVL:       [[MIDDLE_BLOCK]]:
 ; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
 ; NO-EVL-NEXT:    [[TMP17:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
 ; NO-EVL-NEXT:    [[TMP18:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP17]])
@@ -250,19 +253,19 @@ define i32 @simple_csa_int_select_induction_cmp(i32 %N, ptr %data) {
 ; NO-EVL-NEXT:    [[TMP23:%.*]] = icmp sge i32 [[TMP22]], 0
 ; NO-EVL-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i32 [[CSA_EXTRACT]], i32 -1
 ; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; NO-EVL:       scalar.ph:
-; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       exit.loopexit:
-; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP24]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT:    br label [[EXIT]]
-; NO-EVL:       exit:
-; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT]] ]
+; NO-EVL-NEXT:    br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; NO-EVL:       [[SCALAR_PH]]:
+; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-EVL-NEXT:    br label %[[LOOP:.*]]
+; NO-EVL:       [[EXIT_LOOPEXIT]]:
+; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], %[[LOOP]] ], [ [[TMP24]], %[[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT:    br label %[[EXIT]]
+; NO-EVL:       [[EXIT]]:
+; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ]
 ; NO-EVL-NEXT:    ret i32 [[T_0_LCSSA]]
-; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL:       [[LOOP]]:
+; NO-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; NO-EVL-NEXT:    [[T_010:%.*]] = phi i32 [ -1, %[[SCALAR_PH]] ], [ [[SPEC_SELECT]], %[[LOOP]] ]
 ; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
 ; NO-EVL-NEXT:    [[TMP25:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; NO-EVL-NEXT:    [[TMP26:%.*]] = sext i32 [[TMP25]] to i64
@@ -270,23 +273,23 @@ define i32 @simple_csa_int_select_induction_cmp(i32 %N, ptr %data) {
 ; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP25]], i32 [[T_010]]
 ; NO-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
 ;
 entry:
   %cmp9 = icmp sgt i32 %N, 0
-  br i1 %cmp9, label %for.body.preheader, label %exit
+  br i1 %cmp9, label %loop.preheader, label %exit
 
-for.body.preheader:                               ; preds = %entry
+loop.preheader:                               ; preds = %entry
   %wide.trip.count = zext i32 %N to i64
-  br label %for.body
+  br label %loop
 
-exit:                                 ; preds = %for.body, %entry
-  %t.0.lcssa = phi i32 [ -1, %entry ], [ %spec.select, %for.body ]
+exit:                                 ; preds = %loop, %entry
+  %t.0.lcssa = phi i32 [ -1, %entry ], [ %spec.select, %loop ]
   ret i32 %t.0.lcssa
 
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
-  %t.010 = phi i32 [ -1, %for.body.preheader ], [ %spec.select, %for.body ]
+loop:                                         ; preds = %loop.preheader, %loop
+  %iv = phi i64 [ 0, %loop.preheader ], [ %iv.next, %loop ]
+  %t.010 = phi i32 [ -1, %loop.preheader ], [ %spec.select, %loop ]
   %arrayidx = getelementptr inbounds i32, ptr %data, i64 %iv
   %0 = load i32, ptr %arrayidx, align 4
   %1 = sext i32 %0 to i64
@@ -294,7 +297,7 @@ for.body:                                         ; preds = %for.body.preheader,
   %spec.select = select i1 %cmp1, i32 %0, i32 %t.010
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %exit, label %for.body
+  br i1 %exitcond.not, label %exit, label %loop
 }
 
 ; This function is generated from the following C/C++ program:
@@ -307,33 +310,34 @@ for.body:                                         ; preds = %for.body.preheader,
 ;   return t; // use t
 ; }
 define float @simple_csa_float_select(i32 %N, ptr %data) {
-; EVL-LABEL: @simple_csa_float_select(
-; EVL-NEXT:  entry:
-; EVL-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
-; EVL:       for.body.preheader:
+; EVL-LABEL: define float @simple_csa_float_select(
+; EVL-SAME: i32 [[N:%.*]], ptr [[DATA:%.*]]) #[[ATTR0]] {
+; EVL-NEXT:  [[ENTRY:.*]]:
+; EVL-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[N]], 0
+; EVL-NEXT:    br i1 [[CMP8]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT:.*]]
+; EVL:       [[LOOP_PREHEADER]]:
 ; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; EVL:       vector.ph:
-; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; EVL:       [[VECTOR_PH]]:
+; EVL-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP15]], 4
 ; EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
 ; EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP2]]
 ; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
 ; EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
-; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; EVL:       vector.body:
-; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_VL_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[CSA_VL_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
+; EVL:       [[VECTOR_BODY]]:
+; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], %[[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x float> [ poison, %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], %[[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_VL_PHI:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[CSA_VL_SEL:%.*]], %[[VECTOR_BODY]] ]
 ; EVL-NEXT:    [[AVL:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[EVL_BASED_IV]]
 ; EVL-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
 ; EVL-NEXT:    [[TMP6:%.*]] = add i64 [[EVL_BASED_IV]], 0
-; EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[TMP6]]
+; EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[TMP6]]
 ; EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
 ; EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP8]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP5]])
 ; EVL-NEXT:    [[TMP9:%.*]] = fcmp ogt <vscale x 4 x float> [[VP_OP_LOAD]], zeroinitializer
@@ -345,58 +349,59 @@ define float @simple_csa_float_select(i32 %N, ptr %data) {
 ; EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP10]], [[EVL_BASED_IV]]
 ; EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
 ; EVL-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; EVL:       middle.block:
+; EVL-NEXT:    br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; EVL:       [[MIDDLE_BLOCK]]:
 ; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
 ; EVL-NEXT:    [[TMP12:%.*]] = call i32 @llvm.vp.reduce.smax.nxv4i32(i32 -1, <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i1> [[CSA_MASK_SEL]], i32 [[CSA_VL_SEL]])
 ; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x float> [[CSA_DATA_SEL]], i32 [[TMP12]]
 ; EVL-NEXT:    [[TMP13:%.*]] = icmp sge i32 [[TMP12]], 0
 ; EVL-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], float [[CSA_EXTRACT]], float 1.000000e+00
-; EVL-NEXT:    br i1 true, label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; EVL:       scalar.ph:
-; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       exit.loopexit:
-; EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT:    br label [[EXIT]]
-; EVL:       exit:
-; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[EXIT_LOOPEXIT]] ]
+; EVL-NEXT:    br i1 true, label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; EVL:       [[SCALAR_PH]]:
+; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; EVL-NEXT:    br label %[[LOOP:.*]]
+; EVL:       [[EXIT_LOOPEXIT]]:
+; EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], %[[LOOP]] ], [ [[TMP14]], %[[MIDDLE_BLOCK]] ]
+; EVL-NEXT:    br label %[[EXIT]]
+; EVL:       [[EXIT]]:
+; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, %[[ENTRY]] ], [ [[T_1_LCSSA]], %[[EXIT_LOOPEXIT]] ]
 ; EVL-NEXT:    ret float [[T_0_LCSSA]]
-; EVL:       for.body:
-; EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[T_09:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
+; EVL:       [[LOOP]]:
+; EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; EVL-NEXT:    [[T_09:%.*]] = phi float [ 1.000000e+00, %[[SCALAR_PH]] ], [ [[T_1]], %[[LOOP]] ]
 ; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[IV]]
-; EVL-NEXT:    [[TMP15:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP15]], 0.000000e+00
-; EVL-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP15]], float [[T_09]]
+; EVL-NEXT:    [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 0.000000e+00
+; EVL-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP0]], float [[T_09]]
 ; EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
 ;
-; NO-EVL-LABEL: @simple_csa_float_select(
-; NO-EVL-NEXT:  entry:
-; NO-EVL-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
-; NO-EVL:       for.body.preheader:
+; NO-EVL-LABEL: define float @simple_csa_float_select(
+; NO-EVL-SAME: i32 [[N:%.*]], ptr [[DATA:%.*]]) #[[ATTR0]] {
+; NO-EVL-NEXT:  [[ENTRY:.*]]:
+; NO-EVL-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[N]], 0
+; NO-EVL-NEXT:    br i1 [[CMP8]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT:.*]]
+; NO-EVL:       [[LOOP_PREHEADER]]:
 ; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
 ; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; NO-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
 ; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; NO-EVL:       vector.ph:
+; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; NO-EVL:       [[VECTOR_PH]]:
 ; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; NO-EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
 ; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
 ; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
 ; NO-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; NO-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; NO-EVL:       vector.body:
-; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
+; NO-EVL:       [[VECTOR_BODY]]:
+; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], %[[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x float> [ poison, %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], %[[VECTOR_BODY]] ]
 ; NO-EVL-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; NO-EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[TMP6]]
+; NO-EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[TMP6]]
 ; NO-EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
 ; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
 ; NO-EVL-NEXT:    [[TMP9:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD]], zeroinitializer
@@ -405,8 +410,8 @@ define float @simple_csa_float_select(i32 %N, ptr %data) {
 ; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP10]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[CSA_DATA_PHI]]
 ; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; NO-EVL-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; NO-EVL:       middle.block:
+; NO-EVL-NEXT:    br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; NO-EVL:       [[MIDDLE_BLOCK]]:
 ; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
 ; NO-EVL-NEXT:    [[TMP12:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
 ; NO-EVL-NEXT:    [[TMP13:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP12]])
@@ -418,49 +423,49 @@ define float @simple_csa_float_select(i32 %N, ptr %data) {
 ; NO-EVL-NEXT:    [[TMP18:%.*]] = icmp sge i32 [[TMP17]], 0
 ; NO-EVL-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], float [[CSA_EXTRACT]], float 1.000000e+00
 ; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; NO-EVL:       scalar.ph:
-; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       exit.loopexit:
-; NO-EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT:    br label [[EXIT]]
-; NO-EVL:       exit:
-; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[EXIT_LOOPEXIT]] ]
+; NO-EVL-NEXT:    br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; NO-EVL:       [[SCALAR_PH]]:
+; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-EVL-NEXT:    br label %[[LOOP:.*]]
+; NO-EVL:       [[EXIT_LOOPEXIT]]:
+; NO-EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], %[[LOOP]] ], [ [[TMP19]], %[[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT:    br label %[[EXIT]]
+; NO-EVL:       [[EXIT]]:
+; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, %[[ENTRY]] ], [ [[T_1_LCSSA]], %[[EXIT_LOOPEXIT]] ]
 ; NO-EVL-NEXT:    ret float [[T_0_LCSSA]]
-; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[T_09:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
+; NO-EVL:       [[LOOP]]:
+; NO-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; NO-EVL-NEXT:    [[T_09:%.*]] = phi float [ 1.000000e+00, %[[SCALAR_PH]] ], [ [[T_1]], %[[LOOP]] ]
 ; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[IV]]
 ; NO-EVL-NEXT:    [[TMP20:%.*]] = load float, ptr [[ARRAYIDX]], align 4
 ; NO-EVL-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP20]], 0.000000e+00
 ; NO-EVL-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP20]], float [[T_09]]
 ; NO-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
 ;
 entry:
   %cmp8 = icmp sgt i32 %N, 0
-  br i1 %cmp8, label %for.body.preheader, label %exit
+  br i1 %cmp8, label %loop.preheader, label %exit
 
-for.body.preheader:                               ; preds = %entry
+loop.preheader:                               ; preds = %entry
   %wide.trip.count = zext i32 %N to i64
-  br label %for.body
+  br label %loop
 
-exit:                                 ; preds = %for.body, %entry
-  %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.1, %for.body ]
+exit:                                 ; preds = %loop, %entry
+  %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.1, %loop ]
   ret float %t.0.lcssa
 
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
-  %t.09 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.body ]
+loop:                                         ; preds = %loop.preheader, %loop
+  %iv = phi i64 [ 0, %loop.preheader ], [ %iv.next, %loop ]
+  %t.09 = phi float [ 1.000000e+00, %loop.preheader ], [ %t.1, %loop ]
   %arrayidx = getelementptr inbounds float, ptr %data, i64 %iv
   %0 = load float, ptr %arrayidx, align 4
   %cmp1 = fcmp ogt float %0, 0.000000e+00
   %t.1 = select i1 %cmp1, float %0, float %t.09
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %exit, label %for.body
+  br i1 %exitcond.not, label %exit, label %loop
 }
 
 ; This function is generated from the following C/C++ program:
@@ -478,34 +483,34 @@ define i32 @simple_csa_int(i32 %N, ptr %cond, ptr %data) {
 ;
 entry:
   %cmp6 = icmp sgt i32 %N, 0
-  br i1 %cmp6, label %for.body.preheader, label %exit
+  br i1 %cmp6, label %loop.preheader, label %exit
 
-for.body.preheader:                               ; preds = %entry
+loop.preheader:                               ; preds = %entry
   %wide.trip.count = zext i32 %N to i64
-  br label %for.body
+  br label %loop
 
 exit:                                 ; preds = %for.inc, %entry
   %t.0.lcssa = phi i32 [ -1, %entry ], [ %t.1, %for.inc ]
   ret i32 %t.0.lcssa
 
-for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
-  %t.07 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
+loop:                                         ; preds = %loop.preheader, %for.inc
+  %iv = phi i64 [ 0, %loop.preheader ], [ %iv.next, %for.inc ]
+  %t.07 = phi i32 [ -1, %loop.preheader ], [ %t.1, %for.inc ]
   %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %iv
   %0 = load i8, ptr %arrayidx, align 1
   %tobool.not = icmp eq i8 %0, 0
   br i1 %tobool.not, label %for.inc, label %if.then
 
-if.then:                                          ; preds = %for.body
+if.then:                                          ; preds = %loop
   %arrayidx2 = getelementptr inbounds i32, ptr %data, i64 %iv
   %1 = load i32, ptr %arrayidx2, align 4
   br label %for.inc
 
-for.inc:                                          ; preds = %for.body, %if.then
-  %t.1 = phi i32 [ %1, %if.then ], [ %t.07, %for.body ]
+for.inc:                                          ; preds = %loop, %if.then
+  %t.1 = phi i32 [ %1, %if.then ], [ %t.07, %loop ]
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %exit, label %for.body
+  br i1 %exitcond.not, label %exit, label %loop
 }
 
 ; This function is generated from the following C/C++ program:
@@ -523,34 +528,34 @@ define float @simple_csa_float(i32 %N, ptr %cond, ptr %data) {
 ;
 entry:
   %cmp6 = icmp sgt i32 %N, 0
-  br i1 %cmp6, label %for.body.preheader, label %exit
+  br i1 %cmp6, label %loop.preheader, label %exit
 
-for.body.preheader:                               ; preds = %entry
+loop.preheader:                               ; preds = %entry
   %wide.trip.count = zext i32 %N to i64
-  br label %for.body
+  br label %loop
 
 exit:                                 ; preds = %for.inc, %entry
   %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.1, %for.inc ]
   ret float %t.0.lcssa
 
-for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
-  %t.07 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
+loop:                                         ; preds = %loop.preheader, %for.inc
+  %iv = phi i64 [ 0, %loop.preheader ], [ %iv.next, %for.inc ]
+  %t.07 = phi float [ 1.000000e+00, %loop.preheader ], [ %t.1, %for.inc ]
   %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %iv
   %0 = load i8, ptr %arrayidx, align 1
   %tobool.not = icmp eq i8 %0, 0
   br i1 %tobool.not, label %for.inc, label %if.then
 
-if.then:                                          ; preds = %for.body
+if.then:                                          ; preds = %loop
   %arrayidx2 = getelementptr inbounds float, ptr %data, i64 %iv
   %1 = load float, ptr %arrayidx2, align 4
   br label %for.inc
 
-for.inc:                                          ; preds = %for.body, %if.then
-  %t.1 = phi float [ %1, %if.then ], [ %t.07, %for.body ]
+for.inc:                                          ; preds = %loop, %if.then
+  %t.1 = phi float [ %1, %if.then ], [ %t.07, %loop ]
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %exit, label %for.body
+  br i1 %exitcond.not, label %exit, label %loop
 }
 
 ; This function is generated from the following C/C++ program:
@@ -566,38 +571,39 @@ for.inc:                                          ; preds = %for.body, %if.then
 ;   return t | s; // use t and s
 ; }
 define i32 @csa_in_series_int_select(i32 %N, ptr %data0, ptr %data1, i64 %a) {
-; EVL-LABEL: @csa_in_series_int_select(
-; EVL-NEXT:  entry:
-; EVL-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
-; EVL:       for.body.preheader:
+; EVL-LABEL: define i32 @csa_in_series_int_select(
+; EVL-SAME: i32 [[N:%.*]], ptr [[DATA0:%.*]], ptr [[DATA1:%.*]], i64 [[A:%.*]]) #[[ATTR0]] {
+; EVL-NEXT:  [[ENTRY:.*]]:
+; EVL-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[N]], 0
+; EVL-NEXT:    br i1 [[CMP21]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT:.*]]
+; EVL:       [[LOOP_PREHEADER]]:
 ; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; EVL:       vector.ph:
-; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
-; EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP2]]
-; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; EVL:       [[VECTOR_PH]]:
+; EVL-NEXT:    [[TMP23:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP24:%.*]] = mul i64 [[TMP23]], 4
+; EVL-NEXT:    [[TMP25:%.*]] = sub i64 [[TMP24]], 1
+; EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP25]]
+; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP24]]
 ; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
-; EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[A:%.*]], i64 0
+; EVL-NEXT:    [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP27:%.*]] = mul i64 [[TMP26]], 4
+; EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[A]], i64 0
 ; EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; EVL:       vector.body:
-; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL7:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL8:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_VL_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[CSA_VL_SEL6:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_VL_PHI3:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[CSA_VL_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
+; EVL:       [[VECTOR_BODY]]:
+; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[CSA_MASK_SEL7:%.*]], %[[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], %[[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL8:%.*]], %[[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_VL_PHI:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[CSA_VL_SEL6:%.*]], %[[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x i32> [ poison, %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], %[[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_VL_PHI3:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[CSA_VL_SEL:%.*]], %[[VECTOR_BODY]] ]
 ; EVL-NEXT:    [[AVL:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[EVL_BASED_IV]]
 ; EVL-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
 ; EVL-NEXT:    [[TMP6:%.*]] = add i64 [[EVL_BASED_IV]], 0
-; EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP6]]
+; EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[TMP6]]
 ; EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
 ; EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP8]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP5]])
 ; EVL-NEXT:    [[TMP9:%.*]] = sext <vscale x 4 x i32> [[VP_OP_LOAD]] to <vscale x 4 x i64>
@@ -606,7 +612,7 @@ define i32 @csa_in_series_int_select(i32 %N, ptr %data0, ptr %data1, i64 %a) {
 ; EVL-NEXT:    [[CSA_VL_SEL]] = select i1 [[CSA_COND_ANYACTIVE]], i32 [[TMP5]], i32 [[CSA_VL_PHI3]]
 ; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[CSA_COND_ANYACTIVE]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> [[CSA_MASK_PHI1]]
 ; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[CSA_COND_ANYACTIVE]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI2]]
-; EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP6]]
+; EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[TMP6]]
 ; EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
 ; EVL-NEXT:    [[VP_OP_LOAD4:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP5]])
 ; EVL-NEXT:    [[TMP13:%.*]] = sext <vscale x 4 x i32> [[VP_OP_LOAD4]] to <vscale x 4 x i64>
@@ -617,10 +623,10 @@ define i32 @csa_in_series_int_select(i32 %N, ptr %data0, ptr %data1, i64 %a) {
 ; EVL-NEXT:    [[CSA_DATA_SEL8]] = select i1 [[CSA_COND_ANYACTIVE5]], <vscale x 4 x i32> [[VP_OP_LOAD4]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
 ; EVL-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP5]] to i64
 ; EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]]
-; EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP27]]
 ; EVL-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; EVL:       middle.block:
+; EVL-NEXT:    br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; EVL:       [[MIDDLE_BLOCK]]:
 ; EVL-NEXT:    [[CSA_STEP9:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
 ; EVL-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vp.reduce.smax.nxv4i32(i32 -1, <vscale x 4 x i32> [[CSA_STEP9]], <vscale x 4 x i1> [[CSA_MASK_SEL7]], i32 [[CSA_VL_SEL6]])
 ; EVL-NEXT:    [[CSA_EXTRACT10:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL8]], i32 [[TMP17]]
@@ -631,64 +637,65 @@ define i32 @csa_in_series_int_select(i32 %N, ptr %data0, ptr %data1, i64 %a) {
 ; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP20]]
 ; EVL-NEXT:    [[TMP21:%.*]] = icmp sge i32 [[TMP20]], 0
 ; EVL-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[CSA_EXTRACT]], i32 -1
-; EVL-NEXT:    br i1 true, label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; EVL:       scalar.ph:
-; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       exit.loopexit:
-; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT:    [[TMP23:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
-; EVL-NEXT:    br label [[EXIT]]
-; EVL:       exit:
-; EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP23]], [[EXIT_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
+; EVL-NEXT:    br i1 true, label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; EVL:       [[SCALAR_PH]]:
+; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; EVL-NEXT:    br label %[[LOOP:.*]]
+; EVL:       [[EXIT_LOOPEXIT]]:
+; EVL-NEXT:    [[SPEC_SELECT:%.*]] = phi i32 [ [[SPEC_SELECT1:%.*]], %[[LOOP]] ], [ [[TMP22]], %[[MIDDLE_BLOCK]] ]
+; EVL-NEXT:    [[S_1:%.*]] = phi i32 [ [[S_2:%.*]], %[[LOOP]] ], [ [[TMP19]], %[[MIDDLE_BLOCK]] ]
+; EVL-NEXT:    [[TMP0:%.*]] = or i32 [[S_1]], [[SPEC_SELECT]]
+; EVL-NEXT:    br label %[[EXIT]]
+; EVL:       [[EXIT]]:
+; EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP0]], %[[EXIT_LOOPEXIT]] ], [ -1, %[[ENTRY]] ]
 ; EVL-NEXT:    ret i32 [[OR]]
-; EVL:       for.body:
-; EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; EVL:       [[LOOP]]:
+; EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; EVL-NEXT:    [[S_023:%.*]] = phi i32 [ -1, %[[SCALAR_PH]] ], [ [[S_2]], %[[LOOP]] ]
+; EVL-NEXT:    [[T_022:%.*]] = phi i32 [ -1, %[[SCALAR_PH]] ], [ [[SPEC_SELECT1]], %[[LOOP]] ]
 ; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[IV]]
-; EVL-NEXT:    [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT:    [[TMP25:%.*]] = sext i32 [[TMP24]] to i64
-; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP25]]
-; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP24]], i32 [[T_022]]
+; EVL-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
+; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP2]]
+; EVL-NEXT:    [[SPEC_SELECT1]] = select i1 [[CMP1]], i32 [[TMP1]], i32 [[T_022]]
 ; EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[IV]]
-; EVL-NEXT:    [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; EVL-NEXT:    [[TMP27:%.*]] = sext i32 [[TMP26]] to i64
-; EVL-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[A]], [[TMP27]]
-; EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP26]], i32 [[S_023]]
+; EVL-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; EVL-NEXT:    [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
+; EVL-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[A]], [[TMP4]]
+; EVL-NEXT:    [[S_2]] = select i1 [[CMP6]], i32 [[TMP3]], i32 [[S_023]]
 ; EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
 ;
-; NO-EVL-LABEL: @csa_in_series_int_select(
-; NO-EVL-NEXT:  entry:
-; NO-EVL-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
-; NO-EVL:       for.body.preheader:
+; NO-EVL-LABEL: define i32 @csa_in_series_int_select(
+; NO-EVL-SAME: i32 [[N:%.*]], ptr [[DATA0:%.*]], ptr [[DATA1:%.*]], i64 [[A:%.*]]) #[[ATTR0]] {
+; NO-EVL-NEXT:  [[ENTRY:.*]]:
+; NO-EVL-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[N]], 0
+; NO-EVL-NEXT:    br i1 [[CMP21]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT:.*]]
+; NO-EVL:       [[LOOP_PREHEADER]]:
 ; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
 ; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; NO-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
 ; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; NO-EVL:       vector.ph:
+; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; NO-EVL:       [[VECTOR_PH]]:
 ; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; NO-EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
 ; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
 ; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
 ; NO-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; NO-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; NO-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[A:%.*]], i64 0
+; NO-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[A]], i64 0
 ; NO-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; NO-EVL:       vector.body:
-; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
+; NO-EVL:       [[VECTOR_BODY]]:
+; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], %[[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], %[[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], %[[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x i32> [ poison, %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], %[[VECTOR_BODY]] ]
 ; NO-EVL-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; NO-EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP6]]
+; NO-EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[TMP6]]
 ; NO-EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
 ; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
 ; NO-EVL-NEXT:    [[TMP9:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
@@ -696,7 +703,7 @@ define i32 @csa_in_series_int_select(i32 %N, ptr %data0, ptr %data1, i64 %a) {
 ; NO-EVL-NEXT:    [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP10]])
 ; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP11]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> [[CSA_MASK_PHI1]]
 ; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP11]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI2]]
-; NO-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP6]]
+; NO-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[TMP6]]
 ; NO-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
 ; NO-EVL-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i32>, ptr [[TMP13]], align 4
 ; NO-EVL-NEXT:    [[TMP14:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD3]] to <vscale x 4 x i64>
@@ -706,8 +713,8 @@ define i32 @csa_in_series_int_select(i32 %N, ptr %data0, ptr %data1, i64 %a) {
 ; NO-EVL-NEXT:    [[CSA_DATA_SEL5]] = select i1 [[TMP16]], <vscale x 4 x i32> [[WIDE_LOAD3]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
 ; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; NO-EVL-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; NO-EVL:       middle.block:
+; NO-EVL-NEXT:    br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; NO-EVL:       [[MIDDLE_BLOCK]]:
 ; NO-EVL-NEXT:    [[CSA_STEP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
 ; NO-EVL-NEXT:    [[TMP18:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL4]], <vscale x 4 x i32> [[CSA_STEP6]], <vscale x 4 x i32> zeroinitializer
 ; NO-EVL-NEXT:    [[TMP19:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP18]])
@@ -729,22 +736,22 @@ define i32 @csa_in_series_int_select(i32 %N, ptr %data0, ptr %data1, i64 %a) {
 ; NO-EVL-NEXT:    [[TMP32:%.*]] = icmp sge i32 [[TMP31]], 0
 ; NO-EVL-NEXT:    [[TMP33:%.*]] = select i1 [[TMP32]], i32 [[CSA_EXTRACT]], i32 -1
 ; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; NO-EVL:       scalar.ph:
-; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       exit.loopexit:
-; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT:    br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; NO-EVL:       [[SCALAR_PH]]:
+; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-EVL-NEXT:    br label %[[LOOP:.*]]
+; NO-EVL:       [[EXIT_LOOPEXIT]]:
+; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], %[[LOOP]] ], [ [[TMP33]], %[[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], %[[LOOP]] ], [ [[TMP25]], %[[MIDDLE_BLOCK]] ]
 ; NO-EVL-NEXT:    [[TMP34:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
-; NO-EVL-NEXT:    br label [[EXIT]]
-; NO-EVL:       exit:
-; NO-EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP34]], [[EXIT_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
+; NO-EVL-NEXT:    br label %[[EXIT]]
+; NO-EVL:       [[EXIT]]:
+; NO-EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP34]], %[[EXIT_LOOPEXIT]] ], [ -1, %[[ENTRY]] ]
 ; NO-EVL-NEXT:    ret i32 [[OR]]
-; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL:       [[LOOP]]:
+; NO-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; NO-EVL-NEXT:    [[S_023:%.*]] = phi i32 [ -1, %[[SCALAR_PH]] ], [ [[S_1]], %[[LOOP]] ]
+; NO-EVL-NEXT:    [[T_022:%.*]] = phi i32 [ -1, %[[SCALAR_PH]] ], [ [[SPEC_SELECT]], %[[LOOP]] ]
 ; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[IV]]
 ; NO-EVL-NEXT:    [[TMP35:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; NO-EVL-NEXT:    [[TMP36:%.*]] = sext i32 [[TMP35]] to i64
@@ -757,17 +764,17 @@ define i32 @csa_in_series_int_select(i32 %N, ptr %data0, ptr %data1, i64 %a) {
 ; NO-EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP37]], i32 [[S_023]]
 ; NO-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
 ;
 entry:
   %cmp21 = icmp sgt i32 %N, 0
-  br i1 %cmp21, label %for.body.preheader, label %exit
+  br i1 %cmp21, label %loop.preheader, label %exit
 
-for.body.preheader:                               ; preds = %entry
+loop.preheader:                               ; preds = %entry
   %wide.trip.count = zext i32 %N to i64
-  br label %for.body
+  br label %loop
 
-exit.loopexit:                        ; preds = %for.body
+exit.loopexit:                        ; preds = %loop
   %0 = or i32 %s.1, %spec.select
   br label %exit
 
@@ -775,10 +782,10 @@ exit:                                 ; preds = %exit.loopexit, %entry
   %or = phi i32 [ %0, %exit.loopexit ], [ -1, %entry ]
   ret i32 %or
 
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
-  %s.023 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.body ]
-  %t.022 = phi i32 [ -1, %for.body.preheader ], [ %spec.select, %for.body ]
+loop:                                         ; preds = %loop.preheader, %loop
+  %iv = phi i64 [ 0, %loop.preheader ], [ %iv.next, %loop ]
+  %s.023 = phi i32 [ -1, %loop.preheader ], [ %s.1, %loop ]
+  %t.022 = phi i32 [ -1, %loop.preheader ], [ %spec.select, %loop ]
   %arrayidx = getelementptr inbounds i32, ptr %data0, i64 %iv
   %1 = load i32, ptr %arrayidx, align 4
   %2 = sext i32 %1 to i64
@@ -791,7 +798,7 @@ for.body:                                         ; preds = %for.body.preheader,
   %s.1 = select i1 %cmp6, i32 %3, i32 %s.023
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %exit.loopexit, label %for.body
+  br i1 %exitcond.not, label %exit.loopexit, label %loop
 }
 
 ; This function is generated from the following C/C++ program:
@@ -810,17 +817,18 @@ define i32 @csa_in_series_int_select_induction_cmp(i32 %N, ptr %data0, ptr %data
 ; EVL-LABEL: @csa_in_series_int_select_induction_cmp(
 ; EVL-NOT: vector.body:
 ;
-; NO-EVL-LABEL: @csa_in_series_int_select_induction_cmp(
-; NO-EVL-NEXT:  entry:
-; NO-EVL-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
-; NO-EVL:       for.body.preheader:
+; NO-EVL-LABEL: define i32 @csa_in_series_int_select_induction_cmp(
+; NO-EVL-SAME: i32 [[N:%.*]], ptr [[DATA0:%.*]], ptr [[DATA1:%.*]]) #[[ATTR0]] {
+; NO-EVL-NEXT:  [[ENTRY:.*]]:
+; NO-EVL-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[N]], 0
+; NO-EVL-NEXT:    br i1 [[CMP21]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT:.*]]
+; NO-EVL:       [[LOOP_PREHEADER]]:
 ; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
 ; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; NO-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
 ; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; NO-EVL:       vector.ph:
+; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; NO-EVL:       [[VECTOR_PH]]:
 ; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; NO-EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
 ; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
@@ -834,16 +842,16 @@ define i32 @csa_in_series_int_select_induction_cmp(i32 %N, ptr %data0, ptr %data
 ; NO-EVL-NEXT:    [[TMP9:%.*]] = mul i64 1, [[TMP5]]
 ; NO-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP9]], i64 0
 ; NO-EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; NO-EVL:       vector.body:
-; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
+; NO-EVL:       [[VECTOR_BODY]]:
+; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], %[[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], %[[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], %[[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x i32> [ poison, %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], %[[VECTOR_BODY]] ]
 ; NO-EVL-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 0
-; NO-EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP10]]
+; NO-EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[TMP10]]
 ; NO-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
 ; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP12]], align 4
 ; NO-EVL-NEXT:    [[TMP13:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
@@ -851,7 +859,7 @@ define i32 @csa_in_series_int_select_induction_cmp(i32 %N, ptr %data0, ptr %data
 ; NO-EVL-NEXT:    [[TMP15:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP14]])
 ; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP15]], <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> [[CSA_MASK_PHI1]]
 ; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP15]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI2]]
-; NO-EVL-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP10]]
+; NO-EVL-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[TMP10]]
 ; NO-EVL-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
 ; NO-EVL-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i32>, ptr [[TMP17]], align 4
 ; NO-EVL-NEXT:    [[TMP18:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD3]] to <vscale x 4 x i64>
@@ -862,8 +870,8 @@ define i32 @csa_in_series_int_select_induction_cmp(i32 %N, ptr %data0, ptr %data
 ; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; NO-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; NO-EVL-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; NO-EVL:       middle.block:
+; NO-EVL-NEXT:    br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; NO-EVL:       [[MIDDLE_BLOCK]]:
 ; NO-EVL-NEXT:    [[CSA_STEP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
 ; NO-EVL-NEXT:    [[TMP22:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL4]], <vscale x 4 x i32> [[CSA_STEP6]], <vscale x 4 x i32> zeroinitializer
 ; NO-EVL-NEXT:    [[TMP23:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP22]])
@@ -885,22 +893,22 @@ define i32 @csa_in_series_int_select_induction_cmp(i32 %N, ptr %data0, ptr %data
 ; NO-EVL-NEXT:    [[TMP36:%.*]] = icmp sge i32 [[TMP35]], 0
 ; NO-EVL-NEXT:    [[TMP37:%.*]] = select i1 [[TMP36]], i32 [[CSA_EXTRACT]], i32 -1
 ; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; NO-EVL:       scalar.ph:
-; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       exit.loopexit:
-; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP37]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP29]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT:    br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; NO-EVL:       [[SCALAR_PH]]:
+; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-EVL-NEXT:    br label %[[LOOP:.*]]
+; NO-EVL:       [[EXIT_LOOPEXIT]]:
+; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], %[[LOOP]] ], [ [[TMP37]], %[[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], %[[LOOP]] ], [ [[TMP29]], %[[MIDDLE_BLOCK]] ]
 ; NO-EVL-NEXT:    [[TMP38:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
-; NO-EVL-NEXT:    br label [[EXIT]]
-; NO-EVL:       exit:
-; NO-EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP38]], [[EXIT_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
+; NO-EVL-NEXT:    br label %[[EXIT]]
+; NO-EVL:       [[EXIT]]:
+; NO-EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP38]], %[[EXIT_LOOPEXIT]] ], [ -1, %[[ENTRY]] ]
 ; NO-EVL-NEXT:    ret i32 [[OR]]
-; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL:       [[LOOP]]:
+; NO-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; NO-EVL-NEXT:    [[S_023:%.*]] = phi i32 [ -1, %[[SCALAR_PH]] ], [ [[S_1]], %[[LOOP]] ]
+; NO-EVL-NEXT:    [[T_022:%.*]] = phi i32 [ -1, %[[SCALAR_PH]] ], [ [[SPEC_SELECT]], %[[LOOP]] ]
 ; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[IV]]
 ; NO-EVL-NEXT:    [[TMP39:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; NO-EVL-NEXT:    [[TMP40:%.*]] = sext i32 [[TMP39]] to i64
@@ -913,17 +921,17 @@ define i32 @csa_in_series_int_select_induction_cmp(i32 %N, ptr %data0, ptr %data
 ; NO-EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP41]], i32 [[S_023]]
 ; NO-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
 ;
 entry:
   %cmp21 = icmp sgt i32 %N, 0
-  br i1 %cmp21, label %for.body.preheader, label %exit
+  br i1 %cmp21, label %loop.preheader, label %exit
 
-for.body.preheader:                               ; preds = %entry
+loop.preheader:                               ; preds = %entry
   %wide.trip.count = zext i32 %N to i64
-  br label %for.body
+  br label %loop
 
-exit.loopexit:                        ; preds = %for.body
+exit.loopexit:                        ; preds = %loop
   %0 = or i32 %s.1, %spec.select
   br label %exit
 
@@ -931,10 +939,10 @@ exit:                                 ; preds = %exit.loopexit, %entry
   %or = phi i32 [ %0, %exit.loopexit ], [ -1, %entry ]
   ret i32 %or
 
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
-  %s.023 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.body ]
-  %t.022 = phi i32 [ -1, %for.body.preheader ], [ %spec.select, %for.body ]
+loop:                                         ; preds = %loop.preheader, %loop
+  %iv = phi i64 [ 0, %loop.preheader ], [ %iv.next, %loop ]
+  %s.023 = phi i32 [ -1, %loop.preheader ], [ %s.1, %loop ]
+  %t.022 = phi i32 [ -1, %loop.preheader ], [ %spec.select, %loop ]
   %arrayidx = getelementptr inbounds i32, ptr %data0, i64 %iv
   %1 = load i32, ptr %arrayidx, align 4
   %2 = sext i32 %1 to i64
@@ -947,7 +955,7 @@ for.body:                                         ; preds = %for.body.preheader,
   %s.1 = select i1 %cmp6, i32 %3, i32 %s.023
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %exit.loopexit, label %for.body
+  br i1 %exitcond.not, label %exit.loopexit, label %loop
 }
 
 ; This function is generated from the following C/C++ program:
@@ -964,36 +972,37 @@ for.body:                                         ; preds = %for.body.preheader,
 ;   return t + s; // use t and s
 ; }
 define float @csa_in_series_float_select(i32 %N, ptr %data0, ptr %data1) {
-; EVL-LABEL: @csa_in_series_float_select(
-; EVL-NEXT:  entry:
-; EVL-NEXT:    [[CMP19:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT:    br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
-; EVL:       for.body.preheader:
+; EVL-LABEL: define float @csa_in_series_float_select(
+; EVL-SAME: i32 [[N:%.*]], ptr [[DATA0:%.*]], ptr [[DATA1:%.*]]) #[[ATTR0]] {
+; EVL-NEXT:  [[ENTRY:.*]]:
+; EVL-NEXT:    [[CMP19:%.*]] = icmp sgt i32 [[N]], 0
+; EVL-NEXT:    br i1 [[CMP19]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT:.*]]
+; EVL:       [[LOOP_PREHEADER]]:
 ; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; EVL:       vector.ph:
-; EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
-; EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP2]]
-; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; EVL:       [[VECTOR_PH]]:
+; EVL-NEXT:    [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP22:%.*]] = mul i64 [[TMP21]], 4
+; EVL-NEXT:    [[TMP23:%.*]] = sub i64 [[TMP22]], 1
+; EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP23]]
+; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP22]]
 ; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
 ; EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
-; EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; EVL:       vector.body:
-; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL7:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL8:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_VL_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[CSA_VL_SEL6:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_VL_PHI3:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[CSA_VL_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
+; EVL:       [[VECTOR_BODY]]:
+; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[CSA_MASK_SEL7:%.*]], %[[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], %[[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x float> [ poison, %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL8:%.*]], %[[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_VL_PHI:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[CSA_VL_SEL6:%.*]], %[[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x float> [ poison, %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], %[[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_VL_PHI3:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[CSA_VL_SEL:%.*]], %[[VECTOR_BODY]] ]
 ; EVL-NEXT:    [[AVL:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[EVL_BASED_IV]]
 ; EVL-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
 ; EVL-NEXT:    [[TMP6:%.*]] = add i64 [[EVL_BASED_IV]], 0
-; EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[TMP6]]
+; EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[DATA0]], i64 [[TMP6]]
 ; EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
 ; EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP8]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP5]])
 ; EVL-NEXT:    [[TMP9:%.*]] = fcmp ogt <vscale x 4 x float> [[VP_OP_LOAD]], zeroinitializer
@@ -1001,7 +1010,7 @@ define float @csa_in_series_float_select(i32 %N, ptr %data0, ptr %data1) {
 ; EVL-NEXT:    [[CSA_VL_SEL]] = select i1 [[CSA_COND_ANYACTIVE]], i32 [[TMP5]], i32 [[CSA_VL_PHI3]]
 ; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[CSA_COND_ANYACTIVE]], <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> [[CSA_MASK_PHI1]]
 ; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[CSA_COND_ANYACTIVE]], <vscale x 4 x float> [[VP_OP_LOAD]], <vscale x 4 x float> [[CSA_DATA_PHI2]]
-; EVL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[TMP6]]
+; EVL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i64 [[TMP6]]
 ; EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 0
 ; EVL-NEXT:    [[VP_OP_LOAD4:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP11]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP5]])
 ; EVL-NEXT:    [[TMP12:%.*]] = fcmp ogt <vscale x 4 x float> [[VP_OP_LOAD4]], zeroinitializer
@@ -1013,8 +1022,8 @@ define float @csa_in_series_float_select(i32 %N, ptr %data0, ptr %data1) {
 ; EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP13]], [[EVL_BASED_IV]]
 ; EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
 ; EVL-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; EVL:       middle.block:
+; EVL-NEXT:    br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; EVL:       [[MIDDLE_BLOCK]]:
 ; EVL-NEXT:    [[CSA_STEP9:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
 ; EVL-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vp.reduce.smax.nxv4i32(i32 -1, <vscale x 4 x i32> [[CSA_STEP9]], <vscale x 4 x i1> [[CSA_MASK_SEL7]], i32 [[CSA_VL_SEL6]])
 ; EVL-NEXT:    [[CSA_EXTRACT10:%.*]] = extractelement <vscale x 4 x float> [[CSA_DATA_SEL8]], i32 [[TMP15]]
@@ -1025,67 +1034,68 @@ define float @csa_in_series_float_select(i32 %N, ptr %data0, ptr %data1) {
 ; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x float> [[CSA_DATA_SEL]], i32 [[TMP18]]
 ; EVL-NEXT:    [[TMP19:%.*]] = icmp sge i32 [[TMP18]], 0
 ; EVL-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], float [[CSA_EXTRACT]], float 1.000000e+00
-; EVL-NEXT:    br i1 true, label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; EVL:       scalar.ph:
-; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; EVL:       exit.loopexit:
-; EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT:    [[TMP21:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
-; EVL-NEXT:    br label [[EXIT]]
-; EVL:       exit:
-; EVL-NEXT:    [[ADD:%.*]] = phi float [ [[TMP21]], [[EXIT_LOOPEXIT]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
+; EVL-NEXT:    br i1 true, label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; EVL:       [[SCALAR_PH]]:
+; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; EVL-NEXT:    br label %[[LOOP:.*]]
+; EVL:       [[EXIT_LOOPEXIT]]:
+; EVL-NEXT:    [[T_1:%.*]] = phi float [ [[T_2:%.*]], %[[LOOP]] ], [ [[TMP20]], %[[MIDDLE_BLOCK]] ]
+; EVL-NEXT:    [[S_1:%.*]] = phi float [ [[S_2:%.*]], %[[LOOP]] ], [ [[TMP17]], %[[MIDDLE_BLOCK]] ]
+; EVL-NEXT:    [[TMP0:%.*]] = fadd float [[T_1]], [[S_1]]
+; EVL-NEXT:    br label %[[EXIT]]
+; EVL:       [[EXIT]]:
+; EVL-NEXT:    [[ADD:%.*]] = phi float [ [[TMP0]], %[[EXIT_LOOPEXIT]] ], [ 2.000000e+00, %[[ENTRY]] ]
 ; EVL-NEXT:    ret float [[ADD]]
-; EVL:       for.body:
-; EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[S_021:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
-; EVL-NEXT:    [[T_020:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
+; EVL:       [[LOOP]]:
+; EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; EVL-NEXT:    [[S_021:%.*]] = phi float [ 1.000000e+00, %[[SCALAR_PH]] ], [ [[S_2]], %[[LOOP]] ]
+; EVL-NEXT:    [[T_020:%.*]] = phi float [ 1.000000e+00, %[[SCALAR_PH]] ], [ [[T_2]], %[[LOOP]] ]
 ; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0]], i64 [[IV]]
-; EVL-NEXT:    [[TMP22:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP22]], 0.000000e+00
-; EVL-NEXT:    [[T_1]] = select i1 [[CMP1]], float [[TMP22]], float [[T_020]]
+; EVL-NEXT:    [[TMP1:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP1]], 0.000000e+00
+; EVL-NEXT:    [[T_2]] = select i1 [[CMP1]], float [[TMP1]], float [[T_020]]
 ; EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i64 [[IV]]
-; EVL-NEXT:    [[TMP23:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
-; EVL-NEXT:    [[CMP6:%.*]] = fcmp ogt float [[TMP23]], 0.000000e+00
-; EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], float [[TMP23]], float [[S_021]]
+; EVL-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
+; EVL-NEXT:    [[CMP6:%.*]] = fcmp ogt float [[TMP2]], 0.000000e+00
+; EVL-NEXT:    [[S_2]] = select i1 [[CMP6]], float [[TMP2]], float [[S_021]]
 ; EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
 ;
-; NO-EVL-LABEL: @csa_in_series_float_select(
-; NO-EVL-NEXT:  entry:
-; NO-EVL-NEXT:    [[CMP19:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
-; NO-EVL:       for.body.preheader:
+; NO-EVL-LABEL: define float @csa_in_series_float_select(
+; NO-EVL-SAME: i32 [[N:%.*]], ptr [[DATA0:%.*]], ptr [[DATA1:%.*]]) #[[ATTR0]] {
+; NO-EVL-NEXT:  [[ENTRY:.*]]:
+; NO-EVL-NEXT:    [[CMP19:%.*]] = icmp sgt i32 [[N]], 0
+; NO-EVL-NEXT:    br i1 [[CMP19]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT:.*]]
+; NO-EVL:       [[LOOP_PREHEADER]]:
 ; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
 ; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; NO-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
 ; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; NO-EVL:       vector.ph:
+; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; NO-EVL:       [[VECTOR_PH]]:
 ; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; NO-EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
 ; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
 ; NO-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
 ; NO-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; NO-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; NO-EVL:       vector.body:
-; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
+; NO-EVL:       [[VECTOR_BODY]]:
+; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], %[[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], %[[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x float> [ poison, %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], %[[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x float> [ poison, %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], %[[VECTOR_BODY]] ]
 ; NO-EVL-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; NO-EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[TMP6]]
+; NO-EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[DATA0]], i64 [[TMP6]]
 ; NO-EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
 ; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
 ; NO-EVL-NEXT:    [[TMP9:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD]], zeroinitializer
 ; NO-EVL-NEXT:    [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP9]])
 ; NO-EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[TMP10]], <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> [[CSA_MASK_PHI1]]
 ; NO-EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[TMP10]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[CSA_DATA_PHI2]]
-; NO-EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[TMP6]]
+; NO-EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i64 [[TMP6]]
 ; NO-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0
 ; NO-EVL-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, ptr [[TMP12]], align 4
 ; NO-EVL-NEXT:    [[TMP13:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD3]], zeroinitializer
@@ -1094,8 +1104,8 @@ define float @csa_in_series_float_select(i32 %N, ptr %data0, ptr %data1) {
 ; NO-EVL-NEXT:    [[CSA_DATA_SEL5]] = select i1 [[TMP14]], <vscale x 4 x float> [[WIDE_LOAD3]], <vscale x 4 x float> [[CSA_DATA_PHI]]
 ; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; NO-EVL-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; NO-EVL:       middle.block:
+; NO-EVL-NEXT:    br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; NO-EVL:       [[MIDDLE_BLOCK]]:
 ; NO-EVL-NEXT:    [[CSA_STEP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
 ; NO-EVL-NEXT:    [[TMP16:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL4]], <vscale x 4 x i32> [[CSA_STEP6]], <vscale x 4 x i32> zeroinitializer
 ; NO-EVL-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP16]])
@@ -1117,22 +1127,22 @@ define float @csa_in_series_float_select(i32 %N, ptr %data0, ptr %data1) {
 ; NO-EVL-NEXT:    [[TMP30:%.*]] = icmp sge i32 [[TMP29]], 0
 ; NO-EVL-NEXT:    [[TMP31:%.*]] = select i1 [[TMP30]], float [[CSA_EXTRACT]], float 1.000000e+00
 ; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; NO-EVL:       scalar.ph:
-; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       exit.loopexit:
-; NO-EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT:    br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; NO-EVL:       [[SCALAR_PH]]:
+; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-EVL-NEXT:    br label %[[LOOP:.*]]
+; NO-EVL:       [[EXIT_LOOPEXIT]]:
+; NO-EVL-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], %[[LOOP]] ], [ [[TMP31]], %[[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], %[[LOOP]] ], [ [[TMP23]], %[[MIDDLE_BLOCK]] ]
 ; NO-EVL-NEXT:    [[TMP32:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
-; NO-EVL-NEXT:    br label [[EXIT]]
-; NO-EVL:       exit:
-; NO-EVL-NEXT:    [[ADD:%.*]] = phi float [ [[TMP32]], [[EXIT_LOOPEXIT]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
+; NO-EVL-NEXT:    br label %[[EXIT]]
+; NO-EVL:       [[EXIT]]:
+; NO-EVL-NEXT:    [[ADD:%.*]] = phi float [ [[TMP32]], %[[EXIT_LOOPEXIT]] ], [ 2.000000e+00, %[[ENTRY]] ]
 ; NO-EVL-NEXT:    ret float [[ADD]]
-; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[S_021:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[T_020:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
+; NO-EVL:       [[LOOP]]:
+; NO-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; NO-EVL-NEXT:    [[S_021:%.*]] = phi float [ 1.000000e+00, %[[SCALAR_PH]] ], [ [[S_1]], %[[LOOP]] ]
+; NO-EVL-NEXT:    [[T_020:%.*]] = phi float [ 1.000000e+00, %[[SCALAR_PH]] ], [ [[T_1]], %[[LOOP]] ]
 ; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0]], i64 [[IV]]
 ; NO-EVL-NEXT:    [[TMP33:%.*]] = load float, ptr [[ARRAYIDX]], align 4
 ; NO-EVL-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP33]], 0.000000e+00
@@ -1143,17 +1153,17 @@ define float @csa_in_series_float_select(i32 %N, ptr %data0, ptr %data1) {
 ; NO-EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], float [[TMP34]], float [[S_021]]
 ; NO-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
 ;
 entry:
   %cmp19 = icmp sgt i32 %N, 0
-  br i1 %cmp19, label %for.body.preheader, label %exit
+  br i1 %cmp19, label %loop.preheader, label %exit
 
-for.body.preheader:                               ; preds = %entry
+loop.preheader:                               ; preds = %entry
   %wide.trip.count = zext i32 %N to i64
-  br label %for.body
+  br label %loop
 
-exit.loopexit:                        ; preds = %for.body
+exit.loopexit:                        ; preds = %loop
   %0 = fadd float %t.1, %s.1
   br label %exit
 
@@ -1161,10 +1171,10 @@ exit:                                 ; preds = %exit.loopexit, %entry
   %add = phi float [ %0, %exit.loopexit ], [ 2.000000e+00, %entry ]
   ret float %add
 
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
-  %s.021 = phi float [ 1.000000e+00, %for.body.preheader ], [ %s.1, %for.body ]
-  %t.020 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.body ]
+loop:                                         ; preds = %loop.preheader, %loop
+  %iv = phi i64 [ 0, %loop.preheader ], [ %iv.next, %loop ]
+  %s.021 = phi float [ 1.000000e+00, %loop.preheader ], [ %s.1, %loop ]
+  %t.020 = phi float [ 1.000000e+00, %loop.preheader ], [ %t.1, %loop ]
   %arrayidx = getelementptr inbounds float, ptr %data0, i64 %iv
   %1 = load float, ptr %arrayidx, align 4
   %cmp1 = fcmp ogt float %1, 0.000000e+00
@@ -1175,7 +1185,7 @@ for.body:                                         ; preds = %for.body.preheader,
   %s.1 = select i1 %cmp6, float %2, float %s.021
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %exit.loopexit, label %for.body
+  br i1 %exitcond.not, label %exit.loopexit, label %loop
 }
 
 ; This function is generated from the following C/C++ program:
@@ -1196,11 +1206,11 @@ define i32 @csa_in_series_int(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %d
 ;
 entry:
   %cmp15 = icmp sgt i32 %N, 0
-  br i1 %cmp15, label %for.body.preheader, label %exit
+  br i1 %cmp15, label %loop.preheader, label %exit
 
-for.body.preheader:                               ; preds = %entry
+loop.preheader:                               ; preds = %entry
   %wide.trip.count = zext i32 %N to i64
-  br label %for.body
+  br label %loop
 
 exit.loopexit:                        ; preds = %for.inc
   %0 = or i32 %s.1, %t.1
@@ -1210,22 +1220,22 @@ exit:                                 ; preds = %exit.loopexit, %entry
   %or = phi i32 [ %0, %exit.loopexit ], [ -1, %entry ]
   ret i32 %or
 
-for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
-  %s.017 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.inc ]
-  %t.016 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
+loop:                                         ; preds = %loop.preheader, %for.inc
+  %iv = phi i64 [ 0, %loop.preheader ], [ %iv.next, %for.inc ]
+  %s.017 = phi i32 [ -1, %loop.preheader ], [ %s.1, %for.inc ]
+  %t.016 = phi i32 [ -1, %loop.preheader ], [ %t.1, %for.inc ]
   %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %iv
   %1 = load i8, ptr %arrayidx, align 1
   %tobool.not = icmp eq i8 %1, 0
   br i1 %tobool.not, label %if.end, label %if.then
 
-if.then:                                          ; preds = %for.body
+if.then:                                          ; preds = %loop
   %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %iv
   %2 = load i32, ptr %arrayidx2, align 4
   br label %if.end
 
-if.end:                                           ; preds = %if.then, %for.body
-  %t.1 = phi i32 [ %2, %if.then ], [ %t.016, %for.body ]
+if.end:                                           ; preds = %if.then, %loop
+  %t.1 = phi i32 [ %2, %if.then ], [ %t.016, %loop ]
   %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %iv
   %3 = load i8, ptr %arrayidx4, align 1
   %tobool5.not = icmp eq i8 %3, 0
@@ -1240,7 +1250,7 @@ for.inc:                                          ; preds = %if.end, %if.then6
   %s.1 = phi i32 [ %4, %if.then6 ], [ %s.017, %if.end ]
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %exit.loopexit, label %for.body
+  br i1 %exitcond.not, label %exit.loopexit, label %loop
 }
 
 ; This function is generated from the following C/C++ program:
@@ -1262,11 +1272,11 @@ define float @csa_in_series_float(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, pt
 ;
 entry:
   %cmp15 = icmp sgt i32 %N, 0
-  br i1 %cmp15, label %for.body.preheader, label %exit
+  br i1 %cmp15, label %loop.preheader, label %exit
 
-for.body.preheader:                               ; preds = %entry
+loop.preheader:                               ; preds = %entry
   %wide.trip.count = zext i32 %N to i64
-  br label %for.body
+  br label %loop
 
 exit.loopexit:                        ; preds = %for.inc
   %0 = fadd float %t.1, %s.1
@@ -1276,22 +1286,22 @@ exit:                                 ; preds = %exit.loopexit, %entry
   %add = phi float [ %0, %exit.loopexit ], [ 2.000000e+00, %entry ]
   ret float %add
 
-for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
-  %s.017 = phi float [ 1.000000e+00, %for.body.preheader ], [ %s.1, %for.inc ]
-  %t.016 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
+loop:                                         ; preds = %loop.preheader, %for.inc
+  %iv = phi i64 [ 0, %loop.preheader ], [ %iv.next, %for.inc ]
+  %s.017 = phi float [ 1.000000e+00, %loop.preheader ], [ %s.1, %for.inc ]
+  %t.016 = phi float [ 1.000000e+00, %loop.preheader ], [ %t.1, %for.inc ]
   %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %iv
   %1 = load i8, ptr %arrayidx, align 1
   %tobool.not = icmp eq i8 %1, 0
   br i1 %tobool.not, label %if.end, label %if.then
 
-if.then:                                          ; preds = %for.body
+if.then:                                          ; preds = %loop
   %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %iv
   %2 = load float, ptr %arrayidx2, align 4
   br label %if.end
 
-if.end:                                           ; preds = %if.then, %for.body
-  %t.1 = phi float [ %2, %if.then ], [ %t.016, %for.body ]
+if.end:                                           ; preds = %if.then, %loop
+  %t.1 = phi float [ %2, %if.then ], [ %t.016, %loop ]
   %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %iv
   %3 = load i8, ptr %arrayidx4, align 1
   %tobool5.not = icmp eq i8 %3, 0
@@ -1306,7 +1316,7 @@ for.inc:                                          ; preds = %if.end, %if.then6
   %s.1 = phi float [ %4, %if.then6 ], [ %s.017, %if.end ]
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %exit.loopexit, label %for.body
+  br i1 %exitcond.not, label %exit.loopexit, label %loop
 }
 
 ; This function is generated from the following C/C++ program:
@@ -1322,24 +1332,24 @@ for.inc:                                          ; preds = %if.end, %if.then6
 ;   return t; // use t
 ; }
 define i32 @csa_in_series_same_scalar_int_select(i32 %N, ptr %data0, ptr %data1) {
-; CHECK-LABEL: @csa_in_series_same_scalar_int_selectcsa_in_series_float(
+; CHECK-LABEL: @csa_in_series_same_scalar_int_select(
 ; CHECK-NOT: vector.body:
 ;
 entry:
   %cmp21 = icmp sgt i32 %N, 0
-  br i1 %cmp21, label %for.body.preheader, label %exit
+  br i1 %cmp21, label %loop.preheader, label %exit
 
-for.body.preheader:                               ; preds = %entry
+loop.preheader:                               ; preds = %entry
   %wide.trip.count = zext i32 %N to i64
-  br label %for.body
+  br label %loop
 
-exit:                                 ; preds = %for.body, %entry
-  %t.0.lcssa = phi i32 [ -1, %entry ], [ %t.2, %for.body ]
+exit:                                 ; preds = %loop, %entry
+  %t.0.lcssa = phi i32 [ -1, %entry ], [ %t.2, %loop ]
   ret i32 %t.0.lcssa
 
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
-  %t.022 = phi i32 [ -1, %for.body.preheader ], [ %t.2, %for.body ]
+loop:                                         ; preds = %loop.preheader, %loop
+  %iv = phi i64 [ 0, %loop.preheader ], [ %iv.next, %loop ]
+  %t.022 = phi i32 [ -1, %loop.preheader ], [ %t.2, %loop ]
   %arrayidx = getelementptr inbounds i32, ptr %data0, i64 %iv
   %0 = load i32, ptr %arrayidx, align 4
   %1 = sext i32 %0 to i64
@@ -1352,7 +1362,7 @@ for.body:                                         ; preds = %for.body.preheader,
   %t.2 = select i1 %cmp6, i32 %2, i32 %spec.select
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %exit, label %for.body
+  br i1 %exitcond.not, label %exit, label %loop
 }
 
 ; This function is generated from the following C/C++ program:
@@ -1373,19 +1383,19 @@ define float @csa_in_series_same_scalar_float_select(i32 %N, ptr %data0, ptr %da
 ;
 entry:
   %cmp19 = icmp sgt i32 %N, 0
-  br i1 %cmp19, label %for.body.preheader, label %exit
+  br i1 %cmp19, label %loop.preheader, label %exit
 
-for.body.preheader:                               ; preds = %entry
+loop.preheader:                               ; preds = %entry
   %wide.trip.count = zext i32 %N to i64
-  br label %for.body
+  br label %loop
 
-exit:                                 ; preds = %for.body, %entry
-  %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.2, %for.body ]
+exit:                                 ; preds = %loop, %entry
+  %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.2, %loop ]
   ret float %t.0.lcssa
 
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
-  %t.020 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.2, %for.body ]
+loop:                                         ; preds = %loop.preheader, %loop
+  %iv = phi i64 [ 0, %loop.preheader ], [ %iv.next, %loop ]
+  %t.020 = phi float [ 1.000000e+00, %loop.preheader ], [ %t.2, %loop ]
   %arrayidx = getelementptr inbounds float, ptr %data0, i64 %iv
   %0 = load float, ptr %arrayidx, align 4
   %cmp1 = fcmp ogt float %0, 0.000000e+00
@@ -1396,7 +1406,7 @@ for.body:                                         ; preds = %for.body.preheader,
   %t.2 = select i1 %cmp6, float %1, float %t.1
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %exit, label %for.body
+  br i1 %exitcond.not, label %exit, label %loop
 }
 
 ; This function is generated from the following C/C++ program:
@@ -1417,31 +1427,31 @@ define i32 @csa_in_series_same_scalar_int(i32 %N, ptr %cond0, ptr %cond1, ptr %d
 ;
 entry:
   %cmp15 = icmp sgt i32 %N, 0
-  br i1 %cmp15, label %for.body.preheader, label %exit
+  br i1 %cmp15, label %loop.preheader, label %exit
 
-for.body.preheader:                               ; preds = %entry
+loop.preheader:                               ; preds = %entry
   %wide.trip.count = zext i32 %N to i64
-  br label %for.body
+  br label %loop
 
 exit:                                 ; preds = %for.inc, %entry
   %t.0.lcssa = phi i32 [ -1, %entry ], [ %t.2, %for.inc ]
   ret i32 %t.0.lcssa
 
-for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
-  %t.016 = phi i32 [ -1, %for.body.preheader ], [ %t.2, %for.inc ]
+loop:                                         ; preds = %loop.preheader, %for.inc
+  %iv = phi i64 [ 0, %loop.preheader ], [ %iv.next, %for.inc ]
+  %t.016 = phi i32 [ -1, %loop.preheader ], [ %t.2, %for.inc ]
   %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %iv
   %0 = load i8, ptr %arrayidx, align 1
   %tobool.not = icmp eq i8 %0, 0
   br i1 %tobool.not, label %if.end, label %if.then
 
-if.then:                                          ; preds = %for.body
+if.then:                                          ; preds = %loop
   %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %iv
   %1 = load i32, ptr %arrayidx2, align 4
   br label %if.end
 
-if.end:                                           ; preds = %if.then, %for.body
-  %t.1 = phi i32 [ %1, %if.then ], [ %t.016, %for.body ]
+if.end:                                           ; preds = %if.then, %loop
+  %t.1 = phi i32 [ %1, %if.then ], [ %t.016, %loop ]
   %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %iv
   %2 = load i8, ptr %arrayidx4, align 1
   %tobool5.not = icmp eq i8 %2, 0
@@ -1456,7 +1466,7 @@ for.inc:                                          ; preds = %if.end, %if.then6
   %t.2 = phi i32 [ %3, %if.then6 ], [ %t.1, %if.end ]
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %exit, label %for.body
+  br i1 %exitcond.not, label %exit, label %loop
 }
 
 ; This function is generated from the following C/C++ program:
@@ -1477,31 +1487,31 @@ define float @csa_in_series_same_scalar_float(i32 %N, ptr %cond0, ptr %cond1, pt
 ;
 entry:
   %cmp15 = icmp sgt i32 %N, 0
-  br i1 %cmp15, label %for.body.preheader, label %exit
+  br i1 %cmp15, label %loop.preheader, label %exit
 
-for.body.preheader:                               ; preds = %entry
+loop.preheader:                               ; preds = %entry
   %wide.trip.count = zext i32 %N to i64
-  br label %for.body
+  br label %loop
 
 exit:                                 ; preds = %for.inc, %entry
   %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.2, %for.inc ]
   ret float %t.0.lcssa
 
-for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
-  %t.016 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.2, %for.inc ]
+loop:                                         ; preds = %loop.preheader, %for.inc
+  %iv = phi i64 [ 0, %loop.preheader ], [ %iv.next, %for.inc ]
+  %t.016 = phi float [ 1.000000e+00, %loop.preheader ], [ %t.2, %for.inc ]
   %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %iv
   %0 = load i8, ptr %arrayidx, align 1
   %tobool.not = icmp eq i8 %0, 0
   br i1 %tobool.not, label %if.end, label %if.then
 
-if.then:                                          ; preds = %for.body
+if.then:                                          ; preds = %loop
   %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %iv
   %1 = load float, ptr %arrayidx2, align 4
   br label %if.end
 
-if.end:                                           ; preds = %if.then, %for.body
-  %t.1 = phi float [ %1, %if.then ], [ %t.016, %for.body ]
+if.end:                                           ; preds = %if.then, %loop
+  %t.1 = phi float [ %1, %if.then ], [ %t.016, %loop ]
   %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %iv
   %2 = load i8, ptr %arrayidx4, align 1
   %tobool5.not = icmp eq i8 %2, 0
@@ -1516,7 +1526,7 @@ for.inc:                                          ; preds = %if.end, %if.then6
   %t.2 = phi float [ %3, %if.then6 ], [ %t.1, %if.end ]
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %exit, label %for.body
+  br i1 %exitcond.not, label %exit, label %loop
 }
 
 ; This function is generated from the following C/C++ program:
@@ -1537,11 +1547,11 @@ define i32 @csa_same_cond_int(i32 %N, ptr %cond, ptr %data0, ptr %data1) {
 ;
 entry:
   %cmp9 = icmp sgt i32 %N, 0
-  br i1 %cmp9, label %for.body.preheader, label %exit
+  br i1 %cmp9, label %loop.preheader, label %exit
 
-for.body.preheader:                               ; preds = %entry
+loop.preheader:                               ; preds = %entry
   %wide.trip.count = zext i32 %N to i64
-  br label %for.body
+  br label %loop
 
 exit.loopexit:                        ; preds = %for.inc
   %0 = or i32 %s.1, %t.1
@@ -1551,28 +1561,28 @@ exit:                                 ; preds = %exit.loopexit, %entry
   %or = phi i32 [ %0, %exit.loopexit ], [ -1, %entry ]
   ret i32 %or
 
-for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
-  %s.011 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.inc ]
-  %t.010 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
+loop:                                         ; preds = %loop.preheader, %for.inc
+  %iv = phi i64 [ 0, %loop.preheader ], [ %iv.next, %for.inc ]
+  %s.011 = phi i32 [ -1, %loop.preheader ], [ %s.1, %for.inc ]
+  %t.010 = phi i32 [ -1, %loop.preheader ], [ %t.1, %for.inc ]
   %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %iv
   %1 = load i8, ptr %arrayidx, align 1
   %tobool.not = icmp eq i8 %1, 0
   br i1 %tobool.not, label %for.inc, label %if.then
 
-if.then:                                          ; preds = %for.body
+if.then:                                          ; preds = %loop
   %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %iv
   %2 = load i32, ptr %arrayidx2, align 4
   %arrayidx4 = getelementptr inbounds i32, ptr %data1, i64 %iv
   %3 = load i32, ptr %arrayidx4, align 4
   br label %for.inc
 
-for.inc:                                          ; preds = %for.body, %if.then
-  %t.1 = phi i32 [ %2, %if.then ], [ %t.010, %for.body ]
-  %s.1 = phi i32 [ %3, %if.then ], [ %s.011, %for.body ]
+for.inc:                                          ; preds = %loop, %if.then
+  %t.1 = phi i32 [ %2, %if.then ], [ %t.010, %loop ]
+  %s.1 = phi i32 [ %3, %if.then ], [ %s.011, %loop ]
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %exit.loopexit, label %for.body
+  br i1 %exitcond.not, label %exit.loopexit, label %loop
 }
 
 ; This function is generated from the following C/C++ program:
@@ -1593,11 +1603,11 @@ define float @csa_same_cond_float(i32 %N, ptr %cond, ptr %data0, ptr %data1) {
 ;
 entry:
   %cmp9 = icmp sgt i32 %N, 0
-  br i1 %cmp9, label %for.body.preheader, label %exit
+  br i1 %cmp9, label %loop.preheader, label %exit
 
-for.body.preheader:                               ; preds = %entry
+loop.preheader:                               ; preds = %entry
   %wide.trip.count = zext i32 %N to i64
-  br label %for.body
+  br label %loop
 
 exit.loopexit:                        ; preds = %for.inc
   %0 = fadd float %t.1, %s.1
@@ -1607,28 +1617,28 @@ exit:                                 ; preds = %exit.loopexit, %entry
   %add = phi float [ %0, %exit.loopexit ], [ 2.000000e+00, %entry ]
   ret float %add
 
-for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
-  %s.011 = phi float [ 1.000000e+00, %for.body.preheader ], [ %s.1, %for.inc ]
-  %t.010 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
+loop:                                         ; preds = %loop.preheader, %for.inc
+  %iv = phi i64 [ 0, %loop.preheader ], [ %iv.next, %for.inc ]
+  %s.011 = phi float [ 1.000000e+00, %loop.preheader ], [ %s.1, %for.inc ]
+  %t.010 = phi float [ 1.000000e+00, %loop.preheader ], [ %t.1, %for.inc ]
   %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %iv
   %1 = load i8, ptr %arrayidx, align 1
   %tobool.not = icmp eq i8 %1, 0
   br i1 %tobool.not, label %for.inc, label %if.then
 
-if.then:                                          ; preds = %for.body
+if.then:                                          ; preds = %loop
   %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %iv
   %2 = load float, ptr %arrayidx2, align 4
   %arrayidx4 = getelementptr inbounds float, ptr %data1, i64 %iv
   %3 = load float, ptr %arrayidx4, align 4
   br label %for.inc
 
-for.inc:                                          ; preds = %for.body, %if.then
-  %t.1 = phi float [ %2, %if.then ], [ %t.010, %for.body ]
-  %s.1 = phi float [ %3, %if.then ], [ %s.011, %for.body ]
+for.inc:                                          ; preds = %loop, %if.then
+  %t.1 = phi float [ %2, %if.then ], [ %t.010, %loop ]
+  %s.1 = phi float [ %3, %if.then ], [ %s.011, %loop ]
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %exit.loopexit, label %for.body
+  br i1 %exitcond.not, label %exit.loopexit, label %loop
 }
 
 ; This function is generated from the following C/C++ program:
@@ -1649,32 +1659,32 @@ define i32 @csa_else_if_same_scalar_int(i32 %N, ptr %cond0, ptr %cond1, ptr %dat
 ;
 entry:
   %cmp15 = icmp sgt i32 %N, 0
-  br i1 %cmp15, label %for.body.preheader, label %exit
+  br i1 %cmp15, label %loop.preheader, label %exit
 
-for.body.preheader:                               ; preds = %entry
+loop.preheader:                               ; preds = %entry
   %wide.trip.count = zext i32 %N to i64
-  br label %for.body
+  br label %loop
 
 exit:                                 ; preds = %for.inc, %entry
   %t.0.lcssa = phi i32 [ -1, %entry ], [ %t.1, %for.inc ]
   ret i32 %t.0.lcssa
 
-for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
-  %t.016 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
+loop:                                         ; preds = %loop.preheader, %for.inc
+  %iv = phi i64 [ 0, %loop.preheader ], [ %iv.next, %for.inc ]
+  %t.016 = phi i32 [ -1, %loop.preheader ], [ %t.1, %for.inc ]
   %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %iv
   %0 = load i8, ptr %arrayidx, align 1
   %tobool.not = icmp eq i8 %0, 0
   br i1 %tobool.not, label %if.else, label %for.inc.sink.split
 
-if.else:                                          ; preds = %for.body
+if.else:                                          ; preds = %loop
   %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %iv
   %1 = load i8, ptr %arrayidx4, align 1
   %tobool5.not = icmp eq i8 %1, 0
   br i1 %tobool5.not, label %for.inc, label %for.inc.sink.split
 
-for.inc.sink.split:                               ; preds = %if.else, %for.body
-  %data0.sink = phi ptr [ %data0, %for.body ], [ %data1, %if.else ]
+for.inc.sink.split:                               ; preds = %if.else, %loop
+  %data0.sink = phi ptr [ %data0, %loop ], [ %data1, %if.else ]
   %arrayidx2 = getelementptr inbounds i32, ptr %data0.sink, i64 %iv
   %2 = load i32, ptr %arrayidx2, align 4
   br label %for.inc
@@ -1683,7 +1693,7 @@ for.inc:                                          ; preds = %for.inc.sink.split,
   %t.1 = phi i32 [ %t.016, %if.else ], [ %2, %for.inc.sink.split ]
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %exit, label %for.body
+  br i1 %exitcond.not, label %exit, label %loop
 }
 
 ; This function is generated from the following C/C++ program:
@@ -1704,32 +1714,32 @@ define float @csa_else_if_same_scalar_float(i32 %N, ptr %cond0, ptr %cond1, ptr
 ;
 entry:
   %cmp15 = icmp sgt i32 %N, 0
-  br i1 %cmp15, label %for.body.preheader, label %exit
+  br i1 %cmp15, label %loop.preheader, label %exit
 
-for.body.preheader:                               ; preds = %entry
+loop.preheader:                               ; preds = %entry
   %wide.trip.count = zext i32 %N to i64
-  br label %for.body
+  br label %loop
 
 exit:                                 ; preds = %for.inc, %entry
   %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.1, %for.inc ]
   ret float %t.0.lcssa
 
-for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
-  %t.016 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
+loop:                                         ; preds = %loop.preheader, %for.inc
+  %iv = phi i64 [ 0, %loop.preheader ], [ %iv.next, %for.inc ]
+  %t.016 = phi float [ 1.000000e+00, %loop.preheader ], [ %t.1, %for.inc ]
   %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %iv
   %0 = load i8, ptr %arrayidx, align 1
   %tobool.not = icmp eq i8 %0, 0
   br i1 %tobool.not, label %if.else, label %for.inc.sink.split
 
-if.else:                                          ; preds = %for.body
+if.else:                                          ; preds = %loop
   %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %iv
   %1 = load i8, ptr %arrayidx4, align 1
   %tobool5.not = icmp eq i8 %1, 0
   br i1 %tobool5.not, label %for.inc, label %for.inc.sink.split
 
-for.inc.sink.split:                               ; preds = %if.else, %for.body
-  %data0.sink = phi ptr [ %data0, %for.body ], [ %data1, %if.else ]
+for.inc.sink.split:                               ; preds = %if.else, %loop
+  %data0.sink = phi ptr [ %data0, %loop ], [ %data1, %if.else ]
   %arrayidx2 = getelementptr inbounds float, ptr %data0.sink, i64 %iv
   %2 = load float, ptr %arrayidx2, align 4
   br label %for.inc
@@ -1738,7 +1748,7 @@ for.inc:                                          ; preds = %for.inc.sink.split,
   %t.1 = phi float [ %t.016, %if.else ], [ %2, %for.inc.sink.split ]
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %exit, label %for.body
+  br i1 %exitcond.not, label %exit, label %loop
 }
 
 ; This function is generated from the following C/C++ program:
@@ -1759,11 +1769,11 @@ define i32 @csa_else_if_int(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %dat
 ;
 entry:
   %cmp15 = icmp sgt i32 %N, 0
-  br i1 %cmp15, label %for.body.preheader, label %exit
+  br i1 %cmp15, label %loop.preheader, label %exit
 
-for.body.preheader:                               ; preds = %entry
+loop.preheader:                               ; preds = %entry
   %wide.trip.count = zext i32 %N to i64
-  br label %for.body
+  br label %loop
 
 exit.loopexit:                        ; preds = %for.inc
   %0 = or i32 %s.1, %t.1
@@ -1773,21 +1783,21 @@ exit:                                 ; preds = %exit.loopexit, %entry
   %or = phi i32 [ %0, %exit.loopexit ], [ -1, %entry ]
   ret i32 %or
 
-for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
-  %s.017 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.inc ]
-  %t.016 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
+loop:                                         ; preds = %loop.preheader, %for.inc
+  %iv = phi i64 [ 0, %loop.preheader ], [ %iv.next, %for.inc ]
+  %s.017 = phi i32 [ -1, %loop.preheader ], [ %s.1, %for.inc ]
+  %t.016 = phi i32 [ -1, %loop.preheader ], [ %t.1, %for.inc ]
   %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %iv
   %1 = load i8, ptr %arrayidx, align 1
   %tobool.not = icmp eq i8 %1, 0
   br i1 %tobool.not, label %if.else, label %if.then
 
-if.then:                                          ; preds = %for.body
+if.then:                                          ; preds = %loop
   %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %iv
   %2 = load i32, ptr %arrayidx2, align 4
   br label %for.inc
 
-if.else:                                          ; preds = %for.body
+if.else:                                          ; preds = %loop
   %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %iv
   %3 = load i8, ptr %arrayidx4, align 1
   %tobool5.not = icmp eq i8 %3, 0
@@ -1803,7 +1813,7 @@ for.inc:                                          ; preds = %if.then, %if.then6,
   %s.1 = phi i32 [ %s.017, %if.then ], [ %4, %if.then6 ], [ %s.017, %if.else ]
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %exit.loopexit, label %for.body
+  br i1 %exitcond.not, label %exit.loopexit, label %loop
 }
 
 ; This function is generated from the following C/C++ program:
@@ -1825,11 +1835,11 @@ define float @csa_else_if_float(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr
 ;
 entry:
   %cmp15 = icmp sgt i32 %N, 0
-  br i1 %cmp15, label %for.body.preheader, label %exit
+  br i1 %cmp15, label %loop.preheader, label %exit
 
-for.body.preheader:                               ; preds = %entry
+loop.preheader:                               ; preds = %entry
   %wide.trip.count = zext i32 %N to i64
-  br label %for.body
+  br label %loop
 
 exit.loopexit:                        ; preds = %for.inc
   %0 = fadd float %t.1, %s.1
@@ -1839,21 +1849,21 @@ exit:                                 ; preds = %exit.loopexit, %entry
   %add = phi float [ %0, %exit.loopexit ], [ 2.000000e+00, %entry ]
   ret float %add
 
-for.body:                                         ; preds = %for.body.preheader, %for.inc
-  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
-  %s.017 = phi float [ 1.000000e+00, %for.body.preheader ], [ %s.1, %for.inc ]
-  %t.016 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
+loop:                                         ; preds = %loop.preheader, %for.inc
+  %iv = phi i64 [ 0, %loop.preheader ], [ %iv.next, %for.inc ]
+  %s.017 = phi float [ 1.000000e+00, %loop.preheader ], [ %s.1, %for.inc ]
+  %t.016 = phi float [ 1.000000e+00, %loop.preheader ], [ %t.1, %for.inc ]
   %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %iv
   %1 = load i8, ptr %arrayidx, align 1
   %tobool.not = icmp eq i8 %1, 0
   br i1 %tobool.not, label %if.else, label %if.then
 
-if.then:                                          ; preds = %for.body
+if.then:                                          ; preds = %loop
   %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %iv
   %2 = load float, ptr %arrayidx2, align 4
   br label %for.inc
 
-if.else:                                          ; preds = %for.body
+if.else:                                          ; preds = %loop
   %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %iv
   %3 = load i8, ptr %arrayidx4, align 1
   %tobool5.not = icmp eq i8 %3, 0
@@ -1869,7 +1879,7 @@ for.inc:                                          ; preds = %if.then, %if.then6,
   %s.1 = phi float [ %s.017, %if.then ], [ %4, %if.then6 ], [ %s.017, %if.else ]
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %exit.loopexit, label %for.body
+  br i1 %exitcond.not, label %exit.loopexit, label %loop
 }
 
 ; This function is generated from the following C/C++ program:
@@ -1883,16 +1893,17 @@ define i64 @idx_scalar(ptr %a, ptr %b, i64 %ii, i64 %n) {
 ; EVL-LABEL: @idx_scalar(
 ; EVL-NOT: vector.body:
 ;
-; NO-EVL-LABEL: @idx_scalar(
-; NO-EVL-NEXT:  entry:
-; NO-EVL-NEXT:    [[CMP8_NOT:%.*]] = icmp eq i64 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP8_NOT]], label [[EXIT:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
-; NO-EVL:       for.body.preheader:
+; NO-EVL-LABEL: define i64 @idx_scalar(
+; NO-EVL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[II:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; NO-EVL-NEXT:  [[ENTRY:.*]]:
+; NO-EVL-NEXT:    [[CMP8_NOT:%.*]] = icmp eq i64 [[N]], 0
+; NO-EVL-NEXT:    br i1 [[CMP8_NOT]], label %[[EXIT:.*]], label %[[LOOP_PREHEADER:.*]]
+; NO-EVL:       [[LOOP_PREHEADER]]:
 ; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; NO-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
 ; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
-; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; NO-EVL:       vector.ph:
+; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; NO-EVL:       [[VECTOR_PH]]:
 ; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; NO-EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
 ; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
@@ -1906,17 +1917,17 @@ define i64 @idx_scalar(ptr %a, ptr %b, i64 %ii, i64 %n) {
 ; NO-EVL-NEXT:    [[TMP9:%.*]] = mul i64 1, [[TMP5]]
 ; NO-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP9]], i64 0
 ; NO-EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; NO-EVL:       vector.body:
-; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
+; NO-EVL:       [[VECTOR_BODY]]:
+; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], %[[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x i64> [ poison, %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], %[[VECTOR_BODY]] ]
 ; NO-EVL-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 0
-; NO-EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP10]]
+; NO-EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]]
 ; NO-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP11]], i32 0
 ; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP12]], align 8
-; NO-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP10]]
+; NO-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP10]]
 ; NO-EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i32 0
 ; NO-EVL-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i64>, ptr [[TMP14]], align 8
 ; NO-EVL-NEXT:    [[TMP15:%.*]] = icmp sgt <vscale x 2 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
@@ -1926,8 +1937,8 @@ define i64 @idx_scalar(ptr %a, ptr %b, i64 %ii, i64 %n) {
 ; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; NO-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; NO-EVL-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; NO-EVL:       middle.block:
+; NO-EVL-NEXT:    br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; NO-EVL:       [[MIDDLE_BLOCK]]:
 ; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
 ; NO-EVL-NEXT:    [[TMP18:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
 ; NO-EVL-NEXT:    [[TMP19:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP18]])
@@ -1937,21 +1948,21 @@ define i64 @idx_scalar(ptr %a, ptr %b, i64 %ii, i64 %n) {
 ; NO-EVL-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], i32 0, i32 -1
 ; NO-EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x i64> [[CSA_DATA_SEL]], i32 [[TMP23]]
 ; NO-EVL-NEXT:    [[TMP24:%.*]] = icmp sge i32 [[TMP23]], 0
-; NO-EVL-NEXT:    [[TMP25:%.*]] = select i1 [[TMP24]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
+; NO-EVL-NEXT:    [[TMP25:%.*]] = select i1 [[TMP24]], i64 [[CSA_EXTRACT]], i64 [[II]]
 ; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; NO-EVL:       scalar.ph:
-; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       exit.loopexit:
-; NO-EVL-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT:    br label [[EXIT]]
-; NO-EVL:       exit:
-; NO-EVL-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[EXIT_LOOPEXIT]] ]
+; NO-EVL-NEXT:    br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; NO-EVL:       [[SCALAR_PH]]:
+; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-EVL-NEXT:    br label %[[LOOP:.*]]
+; NO-EVL:       [[EXIT_LOOPEXIT]]:
+; NO-EVL-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], %[[LOOP]] ], [ [[TMP25]], %[[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT:    br label %[[EXIT]]
+; NO-EVL:       [[EXIT]]:
+; NO-EVL-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ]
 ; NO-EVL-NEXT:    ret i64 [[IDX_0_LCSSA]]
-; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[I_010:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; NO-EVL-NEXT:    [[IDX_09:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
+; NO-EVL:       [[LOOP]]:
+; NO-EVL-NEXT:    [[I_010:%.*]] = phi i64 [ [[INC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; NO-EVL-NEXT:    [[IDX_09:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ], [ [[II]], %[[SCALAR_PH]] ]
 ; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[I_010]]
 ; NO-EVL-NEXT:    [[TMP26:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
 ; NO-EVL-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[I_010]]
@@ -1960,22 +1971,22 @@ define i64 @idx_scalar(ptr %a, ptr %b, i64 %ii, i64 %n) {
 ; NO-EVL-NEXT:    [[COND]] = select i1 [[CMP2]], i64 [[I_010]], i64 [[IDX_09]]
 ; NO-EVL-NEXT:    [[INC]] = add nuw i64 [[I_010]], 1
 ; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
 ;
 entry:
   %cmp8.not = icmp eq i64 %n, 0
-  br i1 %cmp8.not, label %exit, label %for.body.preheader
+  br i1 %cmp8.not, label %exit, label %loop.preheader
 
-for.body.preheader:                               ; preds = %entry
-  br label %for.body
+loop.preheader:                               ; preds = %entry
+  br label %loop
 
-exit:                                 ; preds = %for.body, %entry
-  %idx.0.lcssa = phi i64 [ %ii, %entry ], [ %cond, %for.body ]
+exit:                                 ; preds = %loop, %entry
+  %idx.0.lcssa = phi i64 [ %ii, %entry ], [ %cond, %loop ]
   ret i64 %idx.0.lcssa
 
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %i.010 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader ]
-  %idx.09 = phi i64 [ %cond, %for.body ], [ %ii, %for.body.preheader ]
+loop:                                         ; preds = %loop.preheader, %loop
+  %i.010 = phi i64 [ %inc, %loop ], [ 0, %loop.preheader ]
+  %idx.09 = phi i64 [ %cond, %loop ], [ %ii, %loop.preheader ]
   %arrayidx = getelementptr inbounds i64, ptr %a, i64 %i.010
   %0 = load i64, ptr %arrayidx, align 8
   %arrayidx1 = getelementptr inbounds i64, ptr %b, i64 %i.010
@@ -1984,7 +1995,7 @@ for.body:                                         ; preds = %for.body.preheader,
   %cond = select i1 %cmp2, i64 %i.010, i64 %idx.09
   %inc = add nuw i64 %i.010, 1
   %exitcond.not = icmp eq i64 %inc, %n
-  br i1 %exitcond.not, label %exit, label %for.body
+  br i1 %exitcond.not, label %exit, label %loop
 }
 
 ; This function is generated from the following C/C++ program:
@@ -2000,18 +2011,18 @@ define dso_local i64 @idx_scalar_dec(ptr %a, ptr %b, i64 %ii, i64 %n) {
 ;
 entry:
   %cmp.not9 = icmp eq i64 %n, 0
-  br i1 %cmp.not9, label %exit, label %for.body.preheader
+  br i1 %cmp.not9, label %exit, label %loop.preheader
 
-for.body.preheader:                               ; preds = %entry
-  br label %for.body
+loop.preheader:                               ; preds = %entry
+  br label %loop
 
-exit:                                 ; preds = %for.body, %entry
-  %idx.0.lcssa = phi i64 [ %ii, %entry ], [ %cond, %for.body ]
+exit:                                 ; preds = %loop, %entry
+  %idx.0.lcssa = phi i64 [ %ii, %entry ], [ %cond, %loop ]
   ret i64 %idx.0.lcssa
 
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %i.011 = phi i64 [ %sub, %for.body ], [ %n, %for.body.preheader ]
-  %idx.010 = phi i64 [ %cond, %for.body ], [ %ii, %for.body.preheader ]
+loop:                                         ; preds = %loop.preheader, %loop
+  %i.011 = phi i64 [ %sub, %loop ], [ %n, %loop.preheader ]
+  %idx.010 = phi i64 [ %cond, %loop ], [ %ii, %loop.preheader ]
   %sub = add i64 %i.011, -1
   %arrayidx = getelementptr inbounds i64, ptr %a, i64 %sub
   %0 = load i64, ptr %arrayidx, align 8
@@ -2020,7 +2031,7 @@ for.body:                                         ; preds = %for.body.preheader,
   %cmp3 = icmp sgt i64 %0, %1
   %cond = select i1 %cmp3, i64 %i.011, i64 %idx.010
   %cmp.not = icmp eq i64 %sub, 0
-  br i1 %cmp.not, label %exit, label %for.body
+  br i1 %cmp.not, label %exit, label %loop
 }
 
 ; This function is generated from the following C/C++ program:
@@ -2035,20 +2046,21 @@ for.body:                                         ; preds = %for.body.preheader,
 ;   return t; // use t
 ; }
 define i32 @simple_csa_int_select_neg_cond(i32 %N, ptr %data) {
-; EVL-LABEL: @idx_scalar_dec(
+; EVL-LABEL: @simple_csa_int_select_neg_cond(
 ; EVL-NOT: vector.body:
 ;
-; NO-EVL-LABEL: @simple_csa_int_select_neg_cond(
-; NO-EVL-NEXT:  entry:
-; NO-EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
-; NO-EVL:       for.body.preheader:
+; NO-EVL-LABEL: define i32 @simple_csa_int_select_neg_cond(
+; NO-EVL-SAME: i32 [[N:%.*]], ptr [[DATA:%.*]]) #[[ATTR0]] {
+; NO-EVL-NEXT:  [[ENTRY:.*]]:
+; NO-EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N]], 0
+; NO-EVL-NEXT:    br i1 [[CMP9]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT:.*]]
+; NO-EVL:       [[LOOP_PREHEADER]]:
 ; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
 ; NO-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; NO-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
 ; NO-EVL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; NO-EVL:       vector.ph:
+; NO-EVL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; NO-EVL:       [[VECTOR_PH]]:
 ; NO-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; NO-EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
 ; NO-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
@@ -2062,14 +2074,14 @@ define i32 @simple_csa_int_select_neg_cond(i32 %N, ptr %data) {
 ; NO-EVL-NEXT:    [[TMP9:%.*]] = mul i64 1, [[TMP5]]
 ; NO-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP9]], i64 0
 ; NO-EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; NO-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; NO-EVL:       vector.body:
-; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
+; NO-EVL:       [[VECTOR_BODY]]:
+; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], %[[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], %[[VECTOR_BODY]] ]
 ; NO-EVL-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 0
-; NO-EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP10]]
+; NO-EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[TMP10]]
 ; NO-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
 ; NO-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP12]], align 4
 ; NO-EVL-NEXT:    [[TMP13:%.*]] = zext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
@@ -2081,8 +2093,8 @@ define i32 @simple_csa_int_select_neg_cond(i32 %N, ptr %data) {
 ; NO-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; NO-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; NO-EVL-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
-; NO-EVL:       middle.block:
+; NO-EVL-NEXT:    br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; NO-EVL:       [[MIDDLE_BLOCK]]:
 ; NO-EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
 ; NO-EVL-NEXT:    [[TMP18:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
 ; NO-EVL-NEXT:    [[TMP19:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP18]])
@@ -2094,19 +2106,19 @@ define i32 @simple_csa_int_select_neg_cond(i32 %N, ptr %data) {
 ; NO-EVL-NEXT:    [[TMP24:%.*]] = icmp sge i32 [[TMP23]], 0
 ; NO-EVL-NEXT:    [[TMP25:%.*]] = select i1 [[TMP24]], i32 [[CSA_EXTRACT]], i32 0
 ; NO-EVL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; NO-EVL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; NO-EVL:       scalar.ph:
-; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; NO-EVL-NEXT:    br label [[FOR_BODY:%.*]]
-; NO-EVL:       exit.loopexit:
-; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT:    br label [[EXIT]]
-; NO-EVL:       exit:
-; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT]] ]
+; NO-EVL-NEXT:    br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; NO-EVL:       [[SCALAR_PH]]:
+; NO-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-EVL-NEXT:    br label %[[LOOP:.*]]
+; NO-EVL:       [[EXIT_LOOPEXIT]]:
+; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], %[[LOOP]] ], [ [[TMP25]], %[[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT:    br label %[[EXIT]]
+; NO-EVL:       [[EXIT]]:
+; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ]
 ; NO-EVL-NEXT:    ret i32 [[T_0_LCSSA]]
-; NO-EVL:       for.body:
-; NO-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT:    [[T_010:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL:       [[LOOP]]:
+; NO-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; NO-EVL-NEXT:    [[T_010:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[SPEC_SELECT]], %[[LOOP]] ]
 ; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
 ; NO-EVL-NEXT:    [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; NO-EVL-NEXT:    [[TMP27:%.*]] = zext i32 [[TMP26]] to i64
@@ -2114,23 +2126,23 @@ define i32 @simple_csa_int_select_neg_cond(i32 %N, ptr %data) {
 ; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP26]]
 ; NO-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
 ;
 entry:
   %cmp9 = icmp sgt i32 %N, 0
-  br i1 %cmp9, label %for.body.preheader, label %exit
+  br i1 %cmp9, label %loop.preheader, label %exit
 
-for.body.preheader:                               ; preds = %entry
+loop.preheader:                               ; preds = %entry
   %wide.trip.count = zext i32 %N to i64
-  br label %for.body
+  br label %loop
 
-exit:                                 ; preds = %for.body, %entry
-  %t.0.lcssa = phi i32 [ 0, %entry ], [ %spec.select, %for.body ]
+exit:                                 ; preds = %loop, %entry
+  %t.0.lcssa = phi i32 [ 0, %entry ], [ %spec.select, %loop ]
   ret i32 %t.0.lcssa
 
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
-  %t.010 = phi i32 [ 0, %for.body.preheader ], [ %spec.select, %for.body ]
+loop:                                         ; preds = %loop.preheader, %loop
+  %iv = phi i64 [ 0, %loop.preheader ], [ %iv.next, %loop ]
+  %t.010 = phi i32 [ 0, %loop.preheader ], [ %spec.select, %loop ]
   %arrayidx = getelementptr inbounds i32, ptr %data, i64 %iv
   %0 = load i32, ptr %arrayidx, align 4
   %1 = zext i32 %0 to i64
@@ -2138,7 +2150,7 @@ for.body:                                         ; preds = %for.body.preheader,
   %spec.select = select i1 %cmp1.not, i32 %t.010, i32 %0
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %exit, label %for.body
+  br i1 %exitcond.not, label %exit, label %loop
 }
 
 ; This function is generated from the following C/C++ program:
@@ -2151,24 +2163,97 @@ for.body:                                         ; preds = %for.body.preheader,
 ;   return t; // use t
 ; }
 define ptr @simple_csa_ptr_select(i32 %N, ptr %data, i64 %a) {
-; CHECK-LABEL: @idx_scalar_dec(
-; CHECK-NOT: vector.body:
+; EVL-LABEL: define ptr @simple_csa_ptr_select(
+; EVL-SAME: i32 [[N:%.*]], ptr [[DATA:%.*]], i64 [[A:%.*]]) #[[ATTR0]] {
+; EVL-NEXT:  [[ENTRY:.*]]:
+; EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N]], 0
+; EVL-NEXT:    br i1 [[CMP9]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT:.*]]
+; EVL:       [[LOOP_PREHEADER]]:
+; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; EVL:       [[VECTOR_PH]]:
+; EVL-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP16]], 2
+; EVL-NEXT:    [[TMP18:%.*]] = sub i64 [[TMP17]], 1
+; EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP18]]
+; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP17]]
+; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[A]], i64 0
+; EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
+; EVL:       [[VECTOR_BODY]]:
+; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], %[[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x ptr> [ poison, %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], %[[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_VL_PHI:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[CSA_VL_SEL:%.*]], %[[VECTOR_BODY]] ]
+; EVL-NEXT:    [[AVL:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[EVL_BASED_IV]]
+; EVL-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
+; EVL-NEXT:    [[TMP6:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds ptr, ptr [[DATA]], i64 [[TMP6]]
+; EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds ptr, ptr [[TMP7]], i32 0
+; EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 2 x ptr> @llvm.vp.load.nxv2p0.p0(ptr align 8 [[TMP8]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP5]])
+; EVL-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.vp.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> align 4 [[VP_OP_LOAD]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP5]])
+; EVL-NEXT:    [[TMP9:%.*]] = sext <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i64>
+; EVL-NEXT:    [[TMP10:%.*]] = icmp slt <vscale x 2 x i64> [[BROADCAST_SPLAT]], [[TMP9]]
+; EVL-NEXT:    [[CSA_COND_ANYACTIVE:%.*]] = call i1 @llvm.vp.reduce.or.nxv2i1(i1 false, <vscale x 2 x i1> [[TMP10]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP5]])
+; EVL-NEXT:    [[CSA_VL_SEL]] = select i1 [[CSA_COND_ANYACTIVE]], i32 [[TMP5]], i32 [[CSA_VL_PHI]]
+; EVL-NEXT:    [[CSA_MASK_SEL]] = select i1 [[CSA_COND_ANYACTIVE]], <vscale x 2 x i1> [[TMP10]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT:    [[CSA_DATA_SEL]] = select i1 [[CSA_COND_ANYACTIVE]], <vscale x 2 x ptr> [[VP_OP_LOAD]], <vscale x 2 x ptr> [[CSA_DATA_PHI]]
+; EVL-NEXT:    [[TMP11:%.*]] = zext i32 [[TMP5]] to i64
+; EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP11]], [[EVL_BASED_IV]]
+; EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; EVL-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; EVL:       [[MIDDLE_BLOCK]]:
+; EVL-NEXT:    [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
+; EVL-NEXT:    [[TMP13:%.*]] = call i32 @llvm.vp.reduce.smax.nxv2i32(i32 -1, <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i1> [[CSA_MASK_SEL]], i32 [[CSA_VL_SEL]])
+; EVL-NEXT:    [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x ptr> [[CSA_DATA_SEL]], i32 [[TMP13]]
+; EVL-NEXT:    [[TMP14:%.*]] = icmp sge i32 [[TMP13]], 0
+; EVL-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], ptr [[CSA_EXTRACT]], ptr null
+; EVL-NEXT:    br i1 true, label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; EVL:       [[SCALAR_PH]]:
+; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; EVL-NEXT:    br label %[[LOOP:.*]]
+; EVL:       [[EXIT_LOOPEXIT]]:
+; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], %[[LOOP]] ], [ [[TMP15]], %[[MIDDLE_BLOCK]] ]
+; EVL-NEXT:    br label %[[EXIT]]
+; EVL:       [[EXIT]]:
+; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi ptr [ null, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ]
+; EVL-NEXT:    ret ptr [[T_0_LCSSA]]
+; EVL:       [[LOOP]]:
+; EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; EVL-NEXT:    [[T_010:%.*]] = phi ptr [ null, %[[SCALAR_PH]] ], [ [[SPEC_SELECT]], %[[LOOP]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA]], i64 [[IV]]
+; EVL-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
+; EVL-NEXT:    [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+; EVL-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
+; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP2]]
+; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP0]], ptr [[T_010]]
+; EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
+;
+; NO-EVL-LABEL: @simple_csa_ptr_select(
+; NO-EVL-NOT: vector.body:
 ;
 entry:
   %cmp9 = icmp sgt i32 %N, 0
-  br i1 %cmp9, label %for.body.preheader, label %exit
+  br i1 %cmp9, label %loop.preheader, label %exit
 
-for.body.preheader:
+loop.preheader:
   %wide.trip.count = zext i32 %N to i64
-  br label %for.body
+  br label %loop
 
 exit:
-  %t.0.lcssa = phi ptr [ null, %entry ], [ %spec.select, %for.body ]
+  %t.0.lcssa = phi ptr [ null, %entry ], [ %spec.select, %loop ]
   ret ptr %t.0.lcssa
 
-for.body:
-  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
-  %t.010 = phi ptr [ null, %for.body.preheader ], [ %spec.select, %for.body ]
+loop:
+  %iv = phi i64 [ 0, %loop.preheader ], [ %iv.next, %loop ]
+  %t.010 = phi ptr [ null, %loop.preheader ], [ %spec.select, %loop ]
   %arrayidx = getelementptr inbounds ptr, ptr %data, i64 %iv
   %0 = load ptr, ptr %arrayidx, align 8
   %1 = load i32, ptr %0, align 4
@@ -2177,5 +2262,5 @@ for.body:
   %spec.select = select i1 %cmp1, ptr %0, ptr %t.010
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %exit, label %for.body
+  br i1 %exitcond.not, label %exit, label %loop
 }

>From 1b54adee101d71fca02b08c3bd5047d17b33d2d7 Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Fri, 11 Oct 2024 10:39:30 -0700
Subject: [PATCH 21/23] fixup! merge CSADescriptor into IVDescriptor file;
 Update return type of isCSAPhi

---
 llvm/include/llvm/Analysis/CSADescriptors.h   | 78 -------------------
 llvm/include/llvm/Analysis/IVDescriptors.h    | 58 +++++++++++++-
 .../Vectorize/LoopVectorizationLegality.h     |  1 -
 llvm/lib/Analysis/CMakeLists.txt              |  1 -
 llvm/lib/Analysis/CSADescriptors.cpp          | 74 ------------------
 llvm/lib/Analysis/IVDescriptors.cpp           | 58 +++++++++++++-
 .../Vectorize/LoopVectorizationLegality.cpp   |  3 +-
 7 files changed, 116 insertions(+), 157 deletions(-)
 delete mode 100644 llvm/include/llvm/Analysis/CSADescriptors.h
 delete mode 100644 llvm/lib/Analysis/CSADescriptors.cpp

diff --git a/llvm/include/llvm/Analysis/CSADescriptors.h b/llvm/include/llvm/Analysis/CSADescriptors.h
deleted file mode 100644
index 75372c1ba93d8a..00000000000000
--- a/llvm/include/llvm/Analysis/CSADescriptors.h
+++ /dev/null
@@ -1,78 +0,0 @@
-//===------------- CSADescriptors.h - CSA Descriptors -----------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file "describes" conditional scalar assignments (CSA).
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Value.h"
-
-#ifndef LLVM_ANALYSIS_CSADESCRIPTORS_H
-#define LLVM_ANALYSIS_CSADESCRIPTORS_H
-
-namespace llvm {
-
-/// A Conditional Scalar Assignment (CSA) is an assignment from an initial
-/// scalar that may or may not occur.
-class CSADescriptor {
-  /// If the conditional assignment occurs inside a loop, then Phi chooses
-  /// the value of the assignment from the entry block or the loop body block.
-  PHINode *Phi = nullptr;
-
-  /// The initial value of the CSA. If the condition guarding the assignment is
-  /// not met, then the assignment retains this value.
-  Value *InitScalar = nullptr;
-
-  /// The Instruction that conditionally assigned to inside the loop.
-  Instruction *Assignment = nullptr;
-
-  /// Create a CSA Descriptor that models an invalid CSA.
-  CSADescriptor() = default;
-
-  /// Create a CSA Descriptor that models a valid CSA with its members
-  /// initialized correctly.
-  CSADescriptor(PHINode *Phi, Instruction *Assignment, Value *InitScalar)
-      : Phi(Phi), InitScalar(InitScalar), Assignment(Assignment) {}
-
-public:
-  /// If Phi is the root of a CSA, return the CSADescriptor of the CSA rooted by
-  /// Phi. Otherwise, return a CSADescriptor with IsValidCSA set to false.
-  static CSADescriptor isCSAPhi(PHINode *Phi, Loop *TheLoop);
-
-  operator bool() const { return isValid(); }
-
-  /// Returns whether SI is the Assignment in CSA
-  static bool isCSASelect(CSADescriptor Desc, SelectInst *SI) {
-    return Desc.getAssignment() == SI;
-  }
-
-  /// Return whether this CSADescriptor models a valid CSA.
-  bool isValid() const { return Phi && InitScalar && Assignment; }
-
-  /// Return the PHI that roots this CSA.
-  PHINode *getPhi() const { return Phi; }
-
-  /// Return the initial value of the CSA. This is the value if the conditional
-  /// assignment does not occur.
-  Value *getInitScalar() const { return InitScalar; }
-
-  /// The Instruction that is used after the loop
-  Instruction *getAssignment() const { return Assignment; }
-
-  /// Return the condition that this CSA is conditional upon.
-  Value *getCond() const {
-    if (auto *SI = dyn_cast_or_null<SelectInst>(Assignment))
-      return SI->getCondition();
-    return nullptr;
-  }
-};
-} // namespace llvm
-
-#endif // LLVM_ANALYSIS_CSADESCRIPTORS_H
diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h
index 5d992faf99d27f..e0508998e0f1b0 100644
--- a/llvm/include/llvm/Analysis/IVDescriptors.h
+++ b/llvm/include/llvm/Analysis/IVDescriptors.h
@@ -6,7 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file "describes" induction and recurrence variables.
+// This file "describes" induction, recurrence, and conditional scalar
+// assignment (CSA) variables.
 //
 //===----------------------------------------------------------------------===//
 
@@ -391,6 +392,61 @@ class InductionDescriptor {
   SmallVector<Instruction *, 2> RedundantCasts;
 };
 
+/// A Conditional Scalar Assignment (CSA) is an assignment from an initial
+/// scalar that may or may not occur.
+class CSADescriptor {
+  /// If the conditional assignment occurs inside a loop, then Phi chooses
+  /// the value of the assignment from the entry block or the loop body block.
+  PHINode *Phi = nullptr;
+
+  /// The initial value of the CSA. If the condition guarding the assignment is
+  /// not met, then the assignment retains this value.
+  Value *InitScalar = nullptr;
+
+  /// The Instruction that conditionally assigned to inside the loop.
+  Instruction *Assignment = nullptr;
+
+  /// Create a CSA Descriptor that models a valid CSA with its members
+  /// initialized correctly.
+  CSADescriptor(PHINode *Phi, Instruction *Assignment, Value *InitScalar)
+      : Phi(Phi), InitScalar(InitScalar), Assignment(Assignment) {}
+
+public:
+  /// Create a CSA Descriptor that models an invalid CSA.
+  CSADescriptor() = default;
+
+  /// If Phi is the root of a CSA, set CSADesc as the CSA rooted by
+  /// Phi. Otherwise, return a false, leaving CSADesc unmodified.
+  static bool isCSAPhi(PHINode *Phi, Loop *TheLoop, CSADescriptor &CSADesc);
+
+  operator bool() const { return isValid(); }
+
+  /// Returns whether SI is the Assignment in CSA
+  static bool isCSASelect(CSADescriptor Desc, SelectInst *SI) {
+    return Desc.getAssignment() == SI;
+  }
+
+  /// Return whether this CSADescriptor models a valid CSA.
+  bool isValid() const { return Phi && InitScalar && Assignment; }
+
+  /// Return the PHI that roots this CSA.
+  PHINode *getPhi() const { return Phi; }
+
+  /// Return the initial value of the CSA. This is the value if the conditional
+  /// assignment does not occur.
+  Value *getInitScalar() const { return InitScalar; }
+
+  /// The Instruction that is used after the loop
+  Instruction *getAssignment() const { return Assignment; }
+
+  /// Return the condition that this CSA is conditional upon.
+  Value *getCond() const {
+    if (auto *SI = dyn_cast_or_null<SelectInst>(Assignment))
+      return SI->getCondition();
+    return nullptr;
+  }
+};
+
 } // end namespace llvm
 
 #endif // LLVM_ANALYSIS_IVDESCRIPTORS_H
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index 78d6a6cb7a59f8..1375218b2f8fa4 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -27,7 +27,6 @@
 #define LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONLEGALITY_H
 
 #include "llvm/ADT/MapVector.h"
-#include "llvm/Analysis/CSADescriptors.h"
 #include "llvm/Analysis/LoopAccessAnalysis.h"
 #include "llvm/Support/TypeSize.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
diff --git a/llvm/lib/Analysis/CMakeLists.txt b/llvm/lib/Analysis/CMakeLists.txt
index 24ca426990d9ed..393803fad89383 100644
--- a/llvm/lib/Analysis/CMakeLists.txt
+++ b/llvm/lib/Analysis/CMakeLists.txt
@@ -46,7 +46,6 @@ add_llvm_component_library(LLVMAnalysis
   CostModel.cpp
   CodeMetrics.cpp
   ConstantFolding.cpp
-  CSADescriptors.cpp
   CtxProfAnalysis.cpp
   CycleAnalysis.cpp
   DDG.cpp
diff --git a/llvm/lib/Analysis/CSADescriptors.cpp b/llvm/lib/Analysis/CSADescriptors.cpp
deleted file mode 100644
index 1f4ac5ef8b2516..00000000000000
--- a/llvm/lib/Analysis/CSADescriptors.cpp
+++ /dev/null
@@ -1,74 +0,0 @@
-//===----------- CSADescriptors..cpp - CSA Descriptors ----------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file "describes" conditional scalar assignments (CSA).
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Analysis/CSADescriptors.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Type.h"
-
-using namespace llvm;
-using namespace llvm::PatternMatch;
-
-#define DEBUG_TYPE "csa-descriptors"
-
-/// Return CSADescriptor that describes a CSA that matches one of these
-/// patterns:
-///   phi loop_inv, (select cmp, value, phi)
-///   phi loop_inv, (select cmp, phi, value)
-///   phi (select cmp, value, phi), loop_inv
-///   phi (select cmp, phi, value), loop_inv
-/// If the CSA does not match any of these paterns, return a CSADescriptor
-/// that describes an InvalidCSA.
-CSADescriptor CSADescriptor::isCSAPhi(PHINode *Phi, Loop *TheLoop) {
-
-  // Must be a scalar
-  Type *Type = Phi->getType();
-  if (!Type->isIntegerTy() && !Type->isFloatingPointTy() &&
-      !Type->isPointerTy())
-    return {};
-
-  // Match phi loop_inv, (select cmp, value, phi)
-  //    or phi loop_inv, (select cmp, phi, value)
-  //    or phi (select cmp, value, phi), loop_inv
-  //    or phi (select cmp, phi, value), loop_inv
-  if (Phi->getNumIncomingValues() != 2)
-    return {};
-  auto SelectInstIt = find_if(Phi->incoming_values(), [&Phi](const Use &U) {
-    return match(U.get(), m_Select(m_Value(), m_Specific(Phi), m_Value())) ||
-           match(U.get(), m_Select(m_Value(), m_Value(), m_Specific(Phi)));
-  });
-  if (SelectInstIt == Phi->incoming_values().end())
-    return {};
-  auto LoopInvIt = find_if(Phi->incoming_values(), [&](Use &U) {
-    return U.get() != *SelectInstIt && TheLoop->isLoopInvariant(U.get());
-  });
-  if (LoopInvIt == Phi->incoming_values().end())
-    return {};
-
-  // Phi or Sel must be used only outside the loop,
-  // excluding if Phi use Sel or Sel use Phi
-  auto IsOnlyUsedOutsideLoop = [&](Value *V, Value *Ignore) {
-    return all_of(V->users(), [Ignore, TheLoop](User *U) {
-      if (U == Ignore)
-        return true;
-      if (auto *I = dyn_cast<Instruction>(U))
-        return !TheLoop->contains(I);
-      return true;
-    });
-  };
-  Instruction *Select = cast<SelectInst>(SelectInstIt->get());
-  Value *LoopInv = LoopInvIt->get();
-  if (!IsOnlyUsedOutsideLoop(Phi, Select) ||
-      !IsOnlyUsedOutsideLoop(Select, Phi))
-    return {};
-
-  return {Phi, Select, LoopInv};
-}
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index 53001421ce6f7a..31be06fd451e8d 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -6,7 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file "describes" induction and recurrence variables.
+// This file "describes" induction, recurrence, and conditional scalar
+// assignment (CSA) variables.
 //
 //===----------------------------------------------------------------------===//
 
@@ -1472,3 +1473,58 @@ bool InductionDescriptor::isInductionPHI(
   D = InductionDescriptor(StartValue, IK_PtrInduction, Step);
   return true;
 }
+
+/// Return CSADescriptor that describes a CSA that matches one of these
+/// patterns:
+///   phi loop_inv, (select cmp, value, phi)
+///   phi loop_inv, (select cmp, phi, value)
+///   phi (select cmp, value, phi), loop_inv
+///   phi (select cmp, phi, value), loop_inv
+/// If the CSA does not match any of these paterns, return a CSADescriptor
+/// that describes an InvalidCSA.
+bool CSADescriptor::isCSAPhi(PHINode *Phi, Loop *TheLoop, CSADescriptor &CSA) {
+
+  // Must be a scalar
+  Type *Type = Phi->getType();
+  if (!Type->isIntegerTy() && !Type->isFloatingPointTy() &&
+      !Type->isPointerTy())
+    return false;
+
+  // Match phi loop_inv, (select cmp, value, phi)
+  //    or phi loop_inv, (select cmp, phi, value)
+  //    or phi (select cmp, value, phi), loop_inv
+  //    or phi (select cmp, phi, value), loop_inv
+  if (Phi->getNumIncomingValues() != 2)
+    return false;
+  auto SelectInstIt = find_if(Phi->incoming_values(), [&Phi](const Use &U) {
+    return match(U.get(), m_Select(m_Value(), m_Specific(Phi), m_Value())) ||
+           match(U.get(), m_Select(m_Value(), m_Value(), m_Specific(Phi)));
+  });
+  if (SelectInstIt == Phi->incoming_values().end())
+    return false;
+  auto LoopInvIt = find_if(Phi->incoming_values(), [&](Use &U) {
+    return U.get() != *SelectInstIt && TheLoop->isLoopInvariant(U.get());
+  });
+  if (LoopInvIt == Phi->incoming_values().end())
+    return false;
+
+  // Phi or Sel must be used only outside the loop,
+  // excluding if Phi use Sel or Sel use Phi
+  auto IsOnlyUsedOutsideLoop = [&](Value *V, Value *Ignore) {
+    return all_of(V->users(), [Ignore, TheLoop](User *U) {
+      if (U == Ignore)
+        return true;
+      if (auto *I = dyn_cast<Instruction>(U))
+        return !TheLoop->contains(I);
+      return true;
+    });
+  };
+  Instruction *Select = cast<SelectInst>(SelectInstIt->get());
+  Value *LoopInv = LoopInvIt->get();
+  if (!IsOnlyUsedOutsideLoop(Phi, Select) ||
+      !IsOnlyUsedOutsideLoop(Select, Phi))
+    return false;
+
+  CSA = CSADescriptor(Phi, Select, LoopInv);
+  return true;
+}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 9f6028e398e68c..72dd7108ca2594 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -894,7 +894,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
         // Check if the PHI can be classified as a CSA PHI.
         if (EnableCSA || (TTI->enableCSAVectorization() &&
                           EnableCSA.getNumOccurrences() == 0)) {
-          if (auto CSADesc = CSADescriptor::isCSAPhi(Phi, TheLoop)) {
+          CSADescriptor CSADesc;
+          if (CSADescriptor::isCSAPhi(Phi, TheLoop, CSADesc)) {
             addCSAPhi(Phi, CSADesc, AllowedExit);
             continue;
           }

>From fec971ad515e603b5604d6bd712a769ac0f53d45 Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Fri, 11 Oct 2024 12:20:57 -0700
Subject: [PATCH 22/23] fixup! don't make anyactive specific to csa

---
 .../Vectorize/LoopVectorizationPlanner.h      |  8 +++----
 .../Transforms/Vectorize/LoopVectorize.cpp    |  2 +-
 llvm/lib/Transforms/Vectorize/VPlan.h         |  4 ++--
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 24 +++++++++----------
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 10 ++++----
 .../Transforms/Vectorize/VPlanVerifier.cpp    |  2 +-
 6 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 77c425a6710a3c..4bf6099cd7466c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -240,9 +240,9 @@ class VPBuilder {
     return createInstruction(VPInstruction::CSAMaskPhi, {InitMask}, DL, Name);
   }
 
-  VPInstruction *createCSAAnyActive(VPValue *Cond, DebugLoc DL,
+  VPInstruction *createAnyActive(VPValue *Cond, DebugLoc DL,
                                     const Twine &Name) {
-    return createInstruction(VPInstruction::CSAAnyActive, {Cond}, DL, Name);
+    return createInstruction(VPInstruction::AnyActive, {Cond}, DL, Name);
   }
 
   VPInstruction *createCSAMaskSel(VPValue *Cond, VPValue *MaskPhi,
@@ -252,9 +252,9 @@ class VPBuilder {
                              {Cond, MaskPhi, AnyActive}, DL, Name);
   }
 
-  VPInstruction *createCSAAnyActiveEVL(VPValue *Cond, VPValue *EVL, DebugLoc DL,
+  VPInstruction *createAnyActiveEVL(VPValue *Cond, VPValue *EVL, DebugLoc DL,
                                        const Twine &Name) {
-    return createInstruction(VPInstruction::CSAAnyActiveEVL, {Cond, EVL}, DL,
+    return createInstruction(VPInstruction::AnyActiveEVL, {Cond, EVL}, DL,
                              Name);
   }
 
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 295e9b93a23028..cd20d4c9ea67c4 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8761,7 +8761,7 @@ addCSAPostprocessRecipes(VPRecipeBuilder &RecipeBuilder,
     }
 
     auto *VPAnyActive =
-        B.createCSAAnyActive(CondToUse, DL, "csa.cond.anyactive");
+        B.createAnyActive(CondToUse, DL, "csa.cond.anyactive");
     VPAnyActive->insertBefore(
         GetVPValue(CSA.second.getAssignment())->getDefiningRecipe());
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index c4c9c1781e76af..ddc915216ce9c0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1320,8 +1320,8 @@ class VPInstruction : public VPRecipeWithIRFlags,
     CSAMaskSel,
     CSAVLPhi,
     CSAVLSel,
-    CSAAnyActive,
-    CSAAnyActiveEVL,
+    AnyActive,
+    AnyActiveEVL,
   };
 
 private:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 61c54050611ba9..1f49b0668b65c5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -390,8 +390,8 @@ bool VPInstruction::canGenerateScalarForFirstLane() const {
   case VPInstruction::ExplicitVectorLength:
   case VPInstruction::CSAVLSel:
   case VPInstruction::CSAVLPhi:
-  case VPInstruction::CSAAnyActive:
-  case VPInstruction::CSAAnyActiveEVL:
+  case VPInstruction::AnyActive:
+  case VPInstruction::AnyActiveEVL:
     return true;
   default:
     return false;
@@ -705,11 +705,11 @@ Value *VPInstruction::generate(VPTransformState &State) {
     cast<PHINode>(MaskPhi)->addIncoming(MaskSel, State.CFG.PrevBB);
     return MaskSel;
   }
-  case VPInstruction::CSAAnyActive: {
+  case VPInstruction::AnyActive: {
     Value *WidenedCond = State.get(getOperand(0));
     return Builder.CreateOrReduce(WidenedCond);
   }
-  case VPInstruction::CSAAnyActiveEVL: {
+  case VPInstruction::AnyActiveEVL: {
     Value *WidenedCond = State.get(getOperand(0));
     Value *AllOnesMask = Constant::getAllOnesValue(
         VectorType::get(Type::getInt1Ty(State.Builder.getContext()), State.VF));
@@ -720,7 +720,7 @@ Value *VPInstruction::generate(VPTransformState &State) {
     Value *AnyActive = State.Builder.CreateIntrinsic(
         WidenedCond->getType()->getScalarType(), Intrinsic::vp_reduce_or,
         {StartValue, WidenedCond, AllOnesMask, EVL}, nullptr,
-        "csa.cond.anyactive");
+        "any.active");
     return AnyActive;
   }
   case VPInstruction::CSAVLPhi: {
@@ -753,8 +753,8 @@ Value *VPInstruction::generate(VPTransformState &State) {
 bool VPInstruction::isVectorToScalar() const {
   return getOpcode() == VPInstruction::ExtractFromEnd ||
          getOpcode() == VPInstruction::ComputeReductionResult ||
-         getOpcode() == VPInstruction::CSAAnyActive ||
-         getOpcode() == VPInstruction::CSAAnyActiveEVL;
+         getOpcode() == VPInstruction::AnyActive ||
+         getOpcode() == VPInstruction::AnyActiveEVL;
 }
 
 bool VPInstruction::isSingleScalar() const {
@@ -947,11 +947,11 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
   case VPInstruction::CSAVLSel:
     O << "csa-vl-sel";
     break;
-  case VPInstruction::CSAAnyActive:
-    O << "csa-anyactive";
+  case VPInstruction::AnyActive:
+    O << "anyactive";
     break;
-  case VPInstruction::CSAAnyActiveEVL:
-    O << "csa-anyactive-evl";
+  case VPInstruction::AnyActiveEVL:
+    O << "anyactive-evl";
     break;
   default:
     O << Instruction::getOpcodeName(getOpcode());
@@ -2346,7 +2346,7 @@ InstructionCost VPCSADataUpdateRecipe::computeCost(ElementCount VF,
   // them here for now since they are related to updating the data and there is
   // no VPInstruction::computeCost support at the moment.
 
-  // CSAAnyActive
+  // AnyActive
   C += TTI.getArithmeticReductionCost(Instruction::Or, VTy, std::nullopt,
                                       CostKind);
   // VPVLSel
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index a4cc358d0011c5..14356372445754 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1350,19 +1350,19 @@ void VPlanTransforms::addActiveLaneMask(
 }
 
 /// Add recipes required to make CSA work with EVL based approach. This
-/// includes replacing \p CSAAnyActive with \p CSAAnyActiveEVL, and adding \p
+/// includes replacing \p AnyActive with \p AnyActiveEVL, and adding \p
 /// CSAVLPhi and \p CSAVLSel instructions.
 static void addExplicitVectorLengthForCSA(
     VPValue &EVL, const MapVector<PHINode *, VPCSAState *> &CSAStates) {
   for (auto &[_, CSAState] : CSAStates) {
-    // CSAAnyActive is used to keep track of whether any condition on the
+    // AnyActive is used to keep track of whether any condition on the
     // current iteration is active. This is used to decide whether the mask
     // should be updated. When we are using EVL, we must only consider the first
-    // EVL number of elements in the mask. Replace CSAAnyActive with the EVL
-    // specific CSAAnyActiveEVL instruction.
+    // EVL number of elements in the mask. Replace AnyActive with the EVL
+    // specific AnyActiveEVL instruction.
     auto *VPAnyActive = CSAState->getVPAnyActive();
     VPBuilder B;
-    auto *VPAnyActiveEVL = B.createCSAAnyActiveEVL(
+    auto *VPAnyActiveEVL = B.createAnyActiveEVL(
         VPAnyActive->getOperand(0), &EVL, VPAnyActive->getDebugLoc(),
         "csa.cond.anyactive");
     VPAnyActiveEVL->insertBefore(VPAnyActive);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index e7fea8ac2fafa8..dc57ed971ea546 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -156,7 +156,7 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
                  [&](const VPScalarCastRecipe *S) { return true; })
              .Case<VPInstruction>([&](const VPInstruction *I) {
                unsigned Opc = I->getOpcode();
-               if (Opc == VPInstruction::CSAAnyActiveEVL ||
+               if (Opc == VPInstruction::AnyActiveEVL ||
                    Opc == VPInstruction::CSAVLSel)
                  return true;
 

>From d18455c8c4c5cac7e76a681a75c793ad43f429d6 Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Mon, 14 Oct 2024 06:09:22 -0700
Subject: [PATCH 23/23] fixup! remove CSAInitMask and CSAInitData recipes

---
 .../Vectorize/LoopVectorizationPlanner.h      |  10 -
 .../Transforms/Vectorize/LoopVectorize.cpp    |  13 +-
 llvm/lib/Transforms/Vectorize/VPlan.h         |   8 +-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  18 -
 .../RISCV/conditional-scalar-assignment.ll    | 782 ++++++++++++++++--
 5 files changed, 735 insertions(+), 96 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 4bf6099cd7466c..9eda4ff91e230d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -225,16 +225,6 @@ class VPBuilder {
     return createInstruction(VPInstruction::PtrAdd, {Ptr, Offset}, DL, Name);
   }
 
-  VPInstruction *createCSAInitMask(DebugLoc DL, const Twine &Name) {
-    return createInstruction(VPInstruction::CSAInitMask, {}, DL, Name);
-  }
-
-  VPInstruction *createCSAInitData(VPValue *InitScalar, DebugLoc DL,
-                                   const Twine &Name) {
-    return createInstruction(VPInstruction::CSAInitData, {InitScalar}, DL,
-                             Name);
-  }
-
   VPInstruction *createCSAMaskPhi(VPValue *InitMask, DebugLoc DL,
                                   const Twine &Name) {
     return createInstruction(VPInstruction::CSAMaskPhi, {InitMask}, DL, Name);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index cd20d4c9ea67c4..b9a5a3d4514f09 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8689,7 +8689,7 @@ static void
 addCSAPreprocessRecipes(const LoopVectorizationLegality::CSAList &CSAs,
                         Loop *OrigLoop, VPBasicBlock *PreheaderVPBB,
                         VPBasicBlock *HeaderVPBB, DebugLoc DL, VFRange &Range,
-                        VPlan &Plan) {
+                        VPlan &Plan, VPRecipeBuilder &Builder) {
 
   // Don't build full CSA for VF=ElementCount::getFixed(1)
   bool IsScalarVF = LoopVectorizationPlanner::getDecisionAndClampRange(
@@ -8708,8 +8708,10 @@ addCSAPreprocessRecipes(const LoopVectorizationLegality::CSAList &CSAs,
     }
 
     VPBuilder PHB(PreheaderVPBB);
-    auto *VPInitMask = PHB.createCSAInitMask(DL, "csa.init.mask");
-    auto *VPInitData = PHB.createCSAInitData(VPInitScalar, DL, "csa.init.data");
+    auto *VPInitMask = Builder.getVPValueOrAddLiveIn(
+        ConstantInt::getFalse(Type::getInt1Ty(CSA.first->getContext())));
+    auto *VPInitData =
+        Builder.getVPValueOrAddLiveIn(PoisonValue::get(CSA.first->getType()));
 
     VPBuilder HB(HeaderVPBB);
     auto *VPMaskPhi = HB.createCSAMaskPhi(VPInitMask, DL, "csa.mask.phi");
@@ -9096,11 +9098,12 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
   bool HasNUW = Style == TailFoldingStyle::None;
   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
 
+  VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
+
   addCSAPreprocessRecipes(Legal->getCSAs(), OrigLoop, Plan->getPreheader(),
                           Plan->getVectorLoopRegion()->getEntryBasicBlock(), DL,
-                          Range, *Plan);
+                          Range, *Plan, RecipeBuilder);
 
-  VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
 
   // ---------------------------------------------------------------------------
   // Pre-construction: record ingredients whose recipes we'll need to further
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index ddc915216ce9c0..6cf6a926112aaa 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -240,7 +240,7 @@ class VPCSAExtractScalarRecipe;
 /// assignment.
 class VPCSAState {
   VPValue *VPInitScalar = nullptr;
-  VPInstruction *VPInitData = nullptr;
+  VPValue *VPInitData = nullptr;
   VPInstruction *VPMaskPhi = nullptr;
   VPInstruction *VPAnyActive = nullptr;
   VPCSAHeaderPHIRecipe *VPPhiRecipe = nullptr;
@@ -248,7 +248,7 @@ class VPCSAState {
   VPCSAExtractScalarRecipe *VPExtractScalar = nullptr;
 
 public:
-  VPCSAState(VPValue *VPInitScalar, VPInstruction *InitData,
+  VPCSAState(VPValue *VPInitScalar, VPValue *InitData,
              VPInstruction *MaskPhi)
       : VPInitScalar(VPInitScalar), VPInitData(InitData), VPMaskPhi(MaskPhi) {}
 
@@ -256,7 +256,7 @@ class VPCSAState {
 
   VPValue *getVPInitScalar() const { return VPInitScalar; }
 
-  VPInstruction *getVPInitData() const { return VPInitData; }
+  VPValue *getVPInitData() const { return VPInitData; }
 
   VPInstruction *getVPMaskPhi() const { return VPMaskPhi; }
 
@@ -1314,8 +1314,6 @@ class VPInstruction : public VPRecipeWithIRFlags,
     // operand). Only generates scalar values (either for the first lane only or
     // for all lanes, depending on its uses).
     PtrAdd,
-    CSAInitMask,
-    CSAInitData,
     CSAMaskPhi,
     CSAMaskSel,
     CSAVLPhi,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 1f49b0668b65c5..d0d5017a6ea228 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -673,18 +673,6 @@ Value *VPInstruction::generate(VPTransformState &State) {
     }
     return NewPhi;
   }
-  case VPInstruction::CSAInitMask: {
-    Value *InitMask = ConstantAggregateZero::get(
-        VectorType::get(Type::getInt1Ty(State.Builder.getContext()), State.VF));
-    State.set(this, InitMask);
-    return InitMask;
-  }
-  case VPInstruction::CSAInitData: {
-    Type *ElemTyp = getOperand(0)->getUnderlyingValue()->getType();
-    Value *InitData = PoisonValue::get(VectorType::get(ElemTyp, State.VF));
-    State.set(this, InitData);
-    return InitData;
-  }
   case VPInstruction::CSAMaskPhi: {
     IRBuilder<>::InsertPointGuard Guard(State.Builder);
     State.Builder.SetInsertPoint(State.CFG.PrevBB->getFirstNonPHI());
@@ -929,12 +917,6 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
   case VPInstruction::PtrAdd:
     O << "ptradd";
     break;
-  case VPInstruction::CSAInitMask:
-    O << "csa-init-mask";
-    break;
-  case VPInstruction::CSAInitData:
-    O << "csa-init-data";
-    break;
   case VPInstruction::CSAMaskPhi:
     O << "csa-mask-phi";
     break;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/conditional-scalar-assignment.ll b/llvm/test/Transforms/LoopVectorize/RISCV/conditional-scalar-assignment.ll
index 26e262c7c302f7..da381b269aa92d 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/conditional-scalar-assignment.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/conditional-scalar-assignment.ll
@@ -41,7 +41,7 @@ define i32 @simple_csa_int_select(i32 %N, ptr %data, i64 %a) {
 ; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], %[[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], %[[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 poison, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], %[[VECTOR_BODY]] ]
 ; EVL-NEXT:    [[CSA_VL_PHI:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[CSA_VL_SEL:%.*]], %[[VECTOR_BODY]] ]
 ; EVL-NEXT:    [[AVL:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[EVL_BASED_IV]]
 ; EVL-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
@@ -112,7 +112,7 @@ define i32 @simple_csa_int_select(i32 %N, ptr %data, i64 %a) {
 ; NO-EVL:       [[VECTOR_BODY]]:
 ; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], %[[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], %[[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 poison, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], %[[VECTOR_BODY]] ]
 ; NO-EVL-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
 ; NO-EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[TMP6]]
 ; NO-EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
@@ -194,8 +194,31 @@ loop:                                         ; preds = %loop.preheader, %loop
 ;   return t; // use t
 ; }
 define i32 @simple_csa_int_select_induction_cmp(i32 %N, ptr %data) {
-; EVL-LABEL: @simple_csa_int_select_induction_cmp(
-; EVL-NOT: vector.body:
+; EVL-LABEL: define i32 @simple_csa_int_select_induction_cmp(
+; EVL-SAME: i32 [[N:%.*]], ptr [[DATA:%.*]]) #[[ATTR0]] {
+; EVL-NEXT:  [[ENTRY:.*]]:
+; EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N]], 0
+; EVL-NEXT:    br i1 [[CMP9]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT:.*]]
+; EVL:       [[LOOP_PREHEADER]]:
+; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT:    br label %[[LOOP:.*]]
+; EVL:       [[EXIT_LOOPEXIT:.*]]:
+; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], %[[LOOP]] ]
+; EVL-NEXT:    br label %[[EXIT]]
+; EVL:       [[EXIT]]:
+; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ]
+; EVL-NEXT:    ret i32 [[T_0_LCSSA]]
+; EVL:       [[LOOP]]:
+; EVL-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; EVL-NEXT:    [[T_010:%.*]] = phi i32 [ -1, %[[LOOP_PREHEADER]] ], [ [[SPEC_SELECT]], %[[LOOP]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
+; EVL-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT:    [[TMP1:%.*]] = sext i32 [[TMP0]] to i64
+; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[IV]], [[TMP1]]
+; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP0]], i32 [[T_010]]
+; EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[LOOP]]
 ;
 ; NO-EVL-LABEL: define i32 @simple_csa_int_select_induction_cmp(
 ; NO-EVL-SAME: i32 [[N:%.*]], ptr [[DATA:%.*]]) #[[ATTR0]] {
@@ -227,7 +250,7 @@ define i32 @simple_csa_int_select_induction_cmp(i32 %N, ptr %data) {
 ; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], %[[VECTOR_BODY]] ]
 ; NO-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], %[[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 poison, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], %[[VECTOR_BODY]] ]
 ; NO-EVL-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 0
 ; NO-EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[TMP10]]
 ; NO-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
@@ -332,7 +355,7 @@ define float @simple_csa_float_select(i32 %N, ptr %data) {
 ; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], %[[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x float> [ poison, %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], %[[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x float> [ shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float poison, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], %[[VECTOR_BODY]] ]
 ; EVL-NEXT:    [[CSA_VL_PHI:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[CSA_VL_SEL:%.*]], %[[VECTOR_BODY]] ]
 ; EVL-NEXT:    [[AVL:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[EVL_BASED_IV]]
 ; EVL-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
@@ -399,7 +422,7 @@ define float @simple_csa_float_select(i32 %N, ptr %data) {
 ; NO-EVL:       [[VECTOR_BODY]]:
 ; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], %[[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x float> [ poison, %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], %[[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x float> [ shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float poison, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], %[[VECTOR_BODY]] ]
 ; NO-EVL-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
 ; NO-EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[TMP6]]
 ; NO-EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
@@ -478,8 +501,36 @@ loop:                                         ; preds = %loop.preheader, %loop
 ;   return t; // use t
 ; }
 define i32 @simple_csa_int(i32 %N, ptr %cond, ptr %data) {
-; CHECK-LABEL: @simple_csa_int(
-; CHECK-NOT: vector.body:
+; CHECK-LABEL: define i32 @simple_csa_int(
+; CHECK-SAME: i32 [[N:%.*]], ptr [[COND:%.*]], ptr [[DATA:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP6]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT:.*]]
+; CHECK:       [[LOOP_PREHEADER]]:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[EXIT_LOOPEXIT:.*]]:
+; CHECK-NEXT:    [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], %[[FOR_INC:.*]] ]
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[T_1_LCSSA]], %[[EXIT_LOOPEXIT]] ]
+; CHECK-NEXT:    ret i32 [[T_0_LCSSA]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ], [ [[IV_NEXT:%.*]], %[[FOR_INC]] ]
+; CHECK-NEXT:    [[T_07:%.*]] = phi i32 [ -1, %[[LOOP_PREHEADER]] ], [ [[T_1]], %[[FOR_INC]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label %[[FOR_INC]], label %[[IF_THEN:.*]]
+; CHECK:       [[IF_THEN]]:
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    br label %[[FOR_INC]]
+; CHECK:       [[FOR_INC]]:
+; CHECK-NEXT:    [[T_1]] = phi i32 [ [[TMP1]], %[[IF_THEN]] ], [ [[T_07]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[LOOP]]
 ;
 entry:
   %cmp6 = icmp sgt i32 %N, 0
@@ -523,8 +574,36 @@ for.inc:                                          ; preds = %loop, %if.then
 ;   return t; // use t
 ; }
 define float @simple_csa_float(i32 %N, ptr %cond, ptr %data) {
-; CHECK-LABEL: @simple_csa_float(
-; CHECK-NOT: vector.body:
+; CHECK-LABEL: define float @simple_csa_float(
+; CHECK-SAME: i32 [[N:%.*]], ptr [[COND:%.*]], ptr [[DATA:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP6]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT:.*]]
+; CHECK:       [[LOOP_PREHEADER]]:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[EXIT_LOOPEXIT:.*]]:
+; CHECK-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], %[[FOR_INC:.*]] ]
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, %[[ENTRY]] ], [ [[T_1_LCSSA]], %[[EXIT_LOOPEXIT]] ]
+; CHECK-NEXT:    ret float [[T_0_LCSSA]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ], [ [[IV_NEXT:%.*]], %[[FOR_INC]] ]
+; CHECK-NEXT:    [[T_07:%.*]] = phi float [ 1.000000e+00, %[[LOOP_PREHEADER]] ], [ [[T_1]], %[[FOR_INC]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label %[[FOR_INC]], label %[[IF_THEN:.*]]
+; CHECK:       [[IF_THEN]]:
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    br label %[[FOR_INC]]
+; CHECK:       [[FOR_INC]]:
+; CHECK-NEXT:    [[T_1]] = phi float [ [[TMP1]], %[[IF_THEN]] ], [ [[T_07]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[LOOP]]
 ;
 entry:
   %cmp6 = icmp sgt i32 %N, 0
@@ -596,9 +675,9 @@ define i32 @csa_in_series_int_select(i32 %N, ptr %data0, ptr %data1, i64 %a) {
 ; EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[CSA_MASK_SEL7:%.*]], %[[VECTOR_BODY]] ]
 ; EVL-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], %[[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL8:%.*]], %[[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 poison, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL8:%.*]], %[[VECTOR_BODY]] ]
 ; EVL-NEXT:    [[CSA_VL_PHI:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[CSA_VL_SEL6:%.*]], %[[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x i32> [ poison, %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], %[[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 poison, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], %[[VECTOR_BODY]] ]
 ; EVL-NEXT:    [[CSA_VL_PHI3:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[CSA_VL_SEL:%.*]], %[[VECTOR_BODY]] ]
 ; EVL-NEXT:    [[AVL:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[EVL_BASED_IV]]
 ; EVL-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
@@ -692,8 +771,8 @@ define i32 @csa_in_series_int_select(i32 %N, ptr %data0, ptr %data1, i64 %a) {
 ; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], %[[VECTOR_BODY]] ]
 ; NO-EVL-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], %[[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], %[[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x i32> [ poison, %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], %[[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 poison, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], %[[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 poison, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], %[[VECTOR_BODY]] ]
 ; NO-EVL-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
 ; NO-EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[TMP6]]
 ; NO-EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
@@ -814,8 +893,39 @@ loop:                                         ; preds = %loop.preheader, %loop
 ;   return t | s; // use t and s
 ; }
 define i32 @csa_in_series_int_select_induction_cmp(i32 %N, ptr %data0, ptr %data1) {
-; EVL-LABEL: @csa_in_series_int_select_induction_cmp(
-; EVL-NOT: vector.body:
+; EVL-LABEL: define i32 @csa_in_series_int_select_induction_cmp(
+; EVL-SAME: i32 [[N:%.*]], ptr [[DATA0:%.*]], ptr [[DATA1:%.*]]) #[[ATTR0]] {
+; EVL-NEXT:  [[ENTRY:.*]]:
+; EVL-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[N]], 0
+; EVL-NEXT:    br i1 [[CMP21]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT:.*]]
+; EVL:       [[LOOP_PREHEADER]]:
+; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT:    br label %[[LOOP:.*]]
+; EVL:       [[EXIT_LOOPEXIT:.*]]:
+; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], %[[LOOP]] ]
+; EVL-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], %[[LOOP]] ]
+; EVL-NEXT:    [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
+; EVL-NEXT:    br label %[[EXIT]]
+; EVL:       [[EXIT]]:
+; EVL-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP0]], %[[EXIT_LOOPEXIT]] ], [ -1, %[[ENTRY]] ]
+; EVL-NEXT:    ret i32 [[OR]]
+; EVL:       [[LOOP]]:
+; EVL-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; EVL-NEXT:    [[S_023:%.*]] = phi i32 [ -1, %[[LOOP_PREHEADER]] ], [ [[S_1]], %[[LOOP]] ]
+; EVL-NEXT:    [[T_022:%.*]] = phi i32 [ -1, %[[LOOP_PREHEADER]] ], [ [[SPEC_SELECT]], %[[LOOP]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[IV]]
+; EVL-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
+; EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[IV]], [[TMP2]]
+; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP1]], i32 [[T_022]]
+; EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[IV]]
+; EVL-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; EVL-NEXT:    [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
+; EVL-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[IV]], [[TMP4]]
+; EVL-NEXT:    [[S_1]] = select i1 [[CMP6]], i32 [[TMP3]], i32 [[S_023]]
+; EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[LOOP]]
 ;
 ; NO-EVL-LABEL: define i32 @csa_in_series_int_select_induction_cmp(
 ; NO-EVL-SAME: i32 [[N:%.*]], ptr [[DATA0:%.*]], ptr [[DATA1:%.*]]) #[[ATTR0]] {
@@ -848,8 +958,8 @@ define i32 @csa_in_series_int_select_induction_cmp(i32 %N, ptr %data0, ptr %data
 ; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], %[[VECTOR_BODY]] ]
 ; NO-EVL-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], %[[VECTOR_BODY]] ]
 ; NO-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], %[[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x i32> [ poison, %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], %[[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 poison, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], %[[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 poison, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], %[[VECTOR_BODY]] ]
 ; NO-EVL-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 0
 ; NO-EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[TMP10]]
 ; NO-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
@@ -995,9 +1105,9 @@ define float @csa_in_series_float_select(i32 %N, ptr %data0, ptr %data1) {
 ; EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[CSA_MASK_SEL7:%.*]], %[[VECTOR_BODY]] ]
 ; EVL-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], %[[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x float> [ poison, %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL8:%.*]], %[[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x float> [ shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float poison, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL8:%.*]], %[[VECTOR_BODY]] ]
 ; EVL-NEXT:    [[CSA_VL_PHI:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[CSA_VL_SEL6:%.*]], %[[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x float> [ poison, %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], %[[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x float> [ shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float poison, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], %[[VECTOR_BODY]] ]
 ; EVL-NEXT:    [[CSA_VL_PHI3:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[CSA_VL_SEL:%.*]], %[[VECTOR_BODY]] ]
 ; EVL-NEXT:    [[AVL:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[EVL_BASED_IV]]
 ; EVL-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
@@ -1085,8 +1195,8 @@ define float @csa_in_series_float_select(i32 %N, ptr %data0, ptr %data1) {
 ; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], %[[VECTOR_BODY]] ]
 ; NO-EVL-NEXT:    [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], %[[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x float> [ poison, %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], %[[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x float> [ poison, %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], %[[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x float> [ shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float poison, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], %[[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x float> [ shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float poison, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], %[[VECTOR_BODY]] ]
 ; NO-EVL-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
 ; NO-EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[DATA0]], i64 [[TMP6]]
 ; NO-EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
@@ -1201,8 +1311,49 @@ loop:                                         ; preds = %loop.preheader, %loop
 ;   return t | s; // use t and s
 ; }
 define i32 @csa_in_series_int(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
-; CHECK-LABEL: @csa_in_series_int(
-; CHECK-NOT: vector.body:
+; CHECK-LABEL: define i32 @csa_in_series_int(
+; CHECK-SAME: i32 [[N:%.*]], ptr [[COND0:%.*]], ptr [[COND1:%.*]], ptr [[DATA0:%.*]], ptr [[DATA1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP15]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT:.*]]
+; CHECK:       [[LOOP_PREHEADER]]:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[EXIT_LOOPEXIT:.*]]:
+; CHECK-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], %[[FOR_INC:.*]] ]
+; CHECK-NEXT:    [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], %[[FOR_INC]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[T_1_LCSSA]]
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP0]], %[[EXIT_LOOPEXIT]] ], [ -1, %[[ENTRY]] ]
+; CHECK-NEXT:    ret i32 [[OR]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ], [ [[IV_NEXT:%.*]], %[[FOR_INC]] ]
+; CHECK-NEXT:    [[S_017:%.*]] = phi i32 [ -1, %[[LOOP_PREHEADER]] ], [ [[S_1]], %[[FOR_INC]] ]
+; CHECK-NEXT:    [[T_016:%.*]] = phi i32 [ -1, %[[LOOP_PREHEADER]] ], [ [[T_1]], %[[FOR_INC]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label %[[IF_END:.*]], label %[[IF_THEN:.*]]
+; CHECK:       [[IF_THEN]]:
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    br label %[[IF_END]]
+; CHECK:       [[IF_END]]:
+; CHECK-NEXT:    [[T_1]] = phi i32 [ [[TMP2]], %[[IF_THEN]] ], [ [[T_016]], %[[LOOP]] ]
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; CHECK-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL5_NOT]], label %[[FOR_INC]], label %[[IF_THEN6:.*]]
+; CHECK:       [[IF_THEN6]]:
+; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4
+; CHECK-NEXT:    br label %[[FOR_INC]]
+; CHECK:       [[FOR_INC]]:
+; CHECK-NEXT:    [[S_1]] = phi i32 [ [[TMP4]], %[[IF_THEN6]] ], [ [[S_017]], %[[IF_END]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[LOOP]]
 ;
 entry:
   %cmp15 = icmp sgt i32 %N, 0
@@ -1267,8 +1418,49 @@ for.inc:                                          ; preds = %if.end, %if.then6
 ;   return t + s; // use t and s
 ; }
 define float @csa_in_series_float(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
-; CHECK-LABEL: @csa_in_series_float(
-; CHECK-NOT: vector.body:
+; CHECK-LABEL: define float @csa_in_series_float(
+; CHECK-SAME: i32 [[N:%.*]], ptr [[COND0:%.*]], ptr [[COND1:%.*]], ptr [[DATA0:%.*]], ptr [[DATA1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP15]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT:.*]]
+; CHECK:       [[LOOP_PREHEADER]]:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[EXIT_LOOPEXIT:.*]]:
+; CHECK-NEXT:    [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], %[[FOR_INC:.*]] ]
+; CHECK-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], %[[FOR_INC]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[ADD:%.*]] = phi float [ [[TMP0]], %[[EXIT_LOOPEXIT]] ], [ 2.000000e+00, %[[ENTRY]] ]
+; CHECK-NEXT:    ret float [[ADD]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ], [ [[IV_NEXT:%.*]], %[[FOR_INC]] ]
+; CHECK-NEXT:    [[S_017:%.*]] = phi float [ 1.000000e+00, %[[LOOP_PREHEADER]] ], [ [[S_1]], %[[FOR_INC]] ]
+; CHECK-NEXT:    [[T_016:%.*]] = phi float [ 1.000000e+00, %[[LOOP_PREHEADER]] ], [ [[T_1]], %[[FOR_INC]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label %[[IF_END:.*]], label %[[IF_THEN:.*]]
+; CHECK:       [[IF_THEN]]:
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    br label %[[IF_END]]
+; CHECK:       [[IF_END]]:
+; CHECK-NEXT:    [[T_1]] = phi float [ [[TMP2]], %[[IF_THEN]] ], [ [[T_016]], %[[LOOP]] ]
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; CHECK-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL5_NOT]], label %[[FOR_INC]], label %[[IF_THEN6:.*]]
+; CHECK:       [[IF_THEN6]]:
+; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
+; CHECK-NEXT:    br label %[[FOR_INC]]
+; CHECK:       [[FOR_INC]]:
+; CHECK-NEXT:    [[S_1]] = phi float [ [[TMP4]], %[[IF_THEN6]] ], [ [[S_017]], %[[IF_END]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[LOOP]]
 ;
 entry:
   %cmp15 = icmp sgt i32 %N, 0
@@ -1332,8 +1524,36 @@ for.inc:                                          ; preds = %if.end, %if.then6
 ;   return t; // use t
 ; }
 define i32 @csa_in_series_same_scalar_int_select(i32 %N, ptr %data0, ptr %data1) {
-; CHECK-LABEL: @csa_in_series_same_scalar_int_select(
-; CHECK-NOT: vector.body:
+; CHECK-LABEL: define i32 @csa_in_series_same_scalar_int_select(
+; CHECK-SAME: i32 [[N:%.*]], ptr [[DATA0:%.*]], ptr [[DATA1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP21]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT:.*]]
+; CHECK:       [[LOOP_PREHEADER]]:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[EXIT_LOOPEXIT:.*]]:
+; CHECK-NEXT:    [[T_2_LCSSA:%.*]] = phi i32 [ [[T_2:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[T_2_LCSSA]], %[[EXIT_LOOPEXIT]] ]
+; CHECK-NEXT:    ret i32 [[T_0_LCSSA]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[T_022:%.*]] = phi i32 [ -1, %[[LOOP_PREHEADER]] ], [ [[T_2]], %[[LOOP]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[TMP0]] to i64
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[IV]], [[TMP1]]
+; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[CMP1]], i32 [[TMP0]], i32 [[T_022]]
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = sext i32 [[TMP2]] to i64
+; CHECK-NEXT:    [[CMP6:%.*]] = icmp slt i64 [[IV]], [[TMP3]]
+; CHECK-NEXT:    [[T_2]] = select i1 [[CMP6]], i32 [[TMP2]], i32 [[SPEC_SELECT]]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[LOOP]]
 ;
 entry:
   %cmp21 = icmp sgt i32 %N, 0
@@ -1378,8 +1598,34 @@ loop:                                         ; preds = %loop.preheader, %loop
 ;   return t; // use t
 ; }
 define float @csa_in_series_same_scalar_float_select(i32 %N, ptr %data0, ptr %data1) {
-; CHECK-LABEL: @csa_in_series_same_scalar_float_select(
-; CHECK-NOT: vector.body:
+; CHECK-LABEL: define float @csa_in_series_same_scalar_float_select(
+; CHECK-SAME: i32 [[N:%.*]], ptr [[DATA0:%.*]], ptr [[DATA1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[CMP19:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP19]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT:.*]]
+; CHECK:       [[LOOP_PREHEADER]]:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[EXIT_LOOPEXIT:.*]]:
+; CHECK-NEXT:    [[T_2_LCSSA:%.*]] = phi float [ [[T_2:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, %[[ENTRY]] ], [ [[T_2_LCSSA]], %[[EXIT_LOOPEXIT]] ]
+; CHECK-NEXT:    ret float [[T_0_LCSSA]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[T_020:%.*]] = phi float [ 1.000000e+00, %[[LOOP_PREHEADER]] ], [ [[T_2]], %[[LOOP]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 0.000000e+00
+; CHECK-NEXT:    [[T_1:%.*]] = select i1 [[CMP1]], float [[TMP0]], float [[T_020]]
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[CMP6:%.*]] = fcmp ogt float [[TMP1]], 0.000000e+00
+; CHECK-NEXT:    [[T_2]] = select i1 [[CMP6]], float [[TMP1]], float [[T_1]]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[LOOP]]
 ;
 entry:
   %cmp19 = icmp sgt i32 %N, 0
@@ -1422,8 +1668,46 @@ loop:                                         ; preds = %loop.preheader, %loop
 ;   return t; // use t
 ; }
 define i32 @csa_in_series_same_scalar_int(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
-; CHECK-LABEL: @csa_in_series_same_scalar_int(
-; CHECK-NOT: vector.body:
+; CHECK-LABEL: define i32 @csa_in_series_same_scalar_int(
+; CHECK-SAME: i32 [[N:%.*]], ptr [[COND0:%.*]], ptr [[COND1:%.*]], ptr [[DATA0:%.*]], ptr [[DATA1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP15]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT:.*]]
+; CHECK:       [[LOOP_PREHEADER]]:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[EXIT_LOOPEXIT:.*]]:
+; CHECK-NEXT:    [[T_2_LCSSA:%.*]] = phi i32 [ [[T_2:%.*]], %[[FOR_INC:.*]] ]
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[T_2_LCSSA]], %[[EXIT_LOOPEXIT]] ]
+; CHECK-NEXT:    ret i32 [[T_0_LCSSA]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ], [ [[IV_NEXT:%.*]], %[[FOR_INC]] ]
+; CHECK-NEXT:    [[T_016:%.*]] = phi i32 [ -1, %[[LOOP_PREHEADER]] ], [ [[T_2]], %[[FOR_INC]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label %[[IF_END:.*]], label %[[IF_THEN:.*]]
+; CHECK:       [[IF_THEN]]:
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    br label %[[IF_END]]
+; CHECK:       [[IF_END]]:
+; CHECK-NEXT:    [[T_1:%.*]] = phi i32 [ [[TMP1]], %[[IF_THEN]] ], [ [[T_016]], %[[LOOP]] ]
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; CHECK-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL5_NOT]], label %[[FOR_INC]], label %[[IF_THEN6:.*]]
+; CHECK:       [[IF_THEN6]]:
+; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4
+; CHECK-NEXT:    br label %[[FOR_INC]]
+; CHECK:       [[FOR_INC]]:
+; CHECK-NEXT:    [[T_2]] = phi i32 [ [[TMP3]], %[[IF_THEN6]] ], [ [[T_1]], %[[IF_END]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[LOOP]]
 ;
 entry:
   %cmp15 = icmp sgt i32 %N, 0
@@ -1482,8 +1766,46 @@ for.inc:                                          ; preds = %if.end, %if.then6
 ;   return t; // use t
 ; }
 define float @csa_in_series_same_scalar_float(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
-; CHECK-LABEL: @csa_in_series_same_scalar_float(
-; CHECK-NOT: vector.body:
+; CHECK-LABEL: define float @csa_in_series_same_scalar_float(
+; CHECK-SAME: i32 [[N:%.*]], ptr [[COND0:%.*]], ptr [[COND1:%.*]], ptr [[DATA0:%.*]], ptr [[DATA1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP15]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT:.*]]
+; CHECK:       [[LOOP_PREHEADER]]:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[EXIT_LOOPEXIT:.*]]:
+; CHECK-NEXT:    [[T_2_LCSSA:%.*]] = phi float [ [[T_2:%.*]], %[[FOR_INC:.*]] ]
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, %[[ENTRY]] ], [ [[T_2_LCSSA]], %[[EXIT_LOOPEXIT]] ]
+; CHECK-NEXT:    ret float [[T_0_LCSSA]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ], [ [[IV_NEXT:%.*]], %[[FOR_INC]] ]
+; CHECK-NEXT:    [[T_016:%.*]] = phi float [ 1.000000e+00, %[[LOOP_PREHEADER]] ], [ [[T_2]], %[[FOR_INC]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label %[[IF_END:.*]], label %[[IF_THEN:.*]]
+; CHECK:       [[IF_THEN]]:
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    br label %[[IF_END]]
+; CHECK:       [[IF_END]]:
+; CHECK-NEXT:    [[T_1:%.*]] = phi float [ [[TMP1]], %[[IF_THEN]] ], [ [[T_016]], %[[LOOP]] ]
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; CHECK-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL5_NOT]], label %[[FOR_INC]], label %[[IF_THEN6:.*]]
+; CHECK:       [[IF_THEN6]]:
+; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
+; CHECK-NEXT:    br label %[[FOR_INC]]
+; CHECK:       [[FOR_INC]]:
+; CHECK-NEXT:    [[T_2]] = phi float [ [[TMP3]], %[[IF_THEN6]] ], [ [[T_1]], %[[IF_END]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[LOOP]]
 ;
 entry:
   %cmp15 = icmp sgt i32 %N, 0
@@ -1542,8 +1864,42 @@ for.inc:                                          ; preds = %if.end, %if.then6
 ;   return t | s; // use t and s
 ; }
 define i32 @csa_same_cond_int(i32 %N, ptr %cond, ptr %data0, ptr %data1) {
-; CHECK-LABEL: @csa_same_cond_int(
-; CHECK-NOT: vector.body:
+; CHECK-LABEL: define i32 @csa_same_cond_int(
+; CHECK-SAME: i32 [[N:%.*]], ptr [[COND:%.*]], ptr [[DATA0:%.*]], ptr [[DATA1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP9]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT:.*]]
+; CHECK:       [[LOOP_PREHEADER]]:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[EXIT_LOOPEXIT:.*]]:
+; CHECK-NEXT:    [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], %[[FOR_INC:.*]] ]
+; CHECK-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], %[[FOR_INC]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[T_1_LCSSA]]
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP0]], %[[EXIT_LOOPEXIT]] ], [ -1, %[[ENTRY]] ]
+; CHECK-NEXT:    ret i32 [[OR]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ], [ [[IV_NEXT:%.*]], %[[FOR_INC]] ]
+; CHECK-NEXT:    [[S_011:%.*]] = phi i32 [ -1, %[[LOOP_PREHEADER]] ], [ [[S_1]], %[[FOR_INC]] ]
+; CHECK-NEXT:    [[T_010:%.*]] = phi i32 [ -1, %[[LOOP_PREHEADER]] ], [ [[T_1]], %[[FOR_INC]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label %[[FOR_INC]], label %[[IF_THEN:.*]]
+; CHECK:       [[IF_THEN]]:
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT:    br label %[[FOR_INC]]
+; CHECK:       [[FOR_INC]]:
+; CHECK-NEXT:    [[T_1]] = phi i32 [ [[TMP2]], %[[IF_THEN]] ], [ [[T_010]], %[[LOOP]] ]
+; CHECK-NEXT:    [[S_1]] = phi i32 [ [[TMP3]], %[[IF_THEN]] ], [ [[S_011]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[LOOP]]
 ;
 entry:
   %cmp9 = icmp sgt i32 %N, 0
@@ -1598,8 +1954,42 @@ for.inc:                                          ; preds = %loop, %if.then
 ;   return t + s; // use t and s
 ; }
 define float @csa_same_cond_float(i32 %N, ptr %cond, ptr %data0, ptr %data1) {
-; CHECK-LABEL: @csa_same_cond_float(
-; CHECK-NOT: vector.body:
+; CHECK-LABEL: define float @csa_same_cond_float(
+; CHECK-SAME: i32 [[N:%.*]], ptr [[COND:%.*]], ptr [[DATA0:%.*]], ptr [[DATA1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP9]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT:.*]]
+; CHECK:       [[LOOP_PREHEADER]]:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[EXIT_LOOPEXIT:.*]]:
+; CHECK-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], %[[FOR_INC:.*]] ]
+; CHECK-NEXT:    [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], %[[FOR_INC]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[ADD:%.*]] = phi float [ [[TMP0]], %[[EXIT_LOOPEXIT]] ], [ 2.000000e+00, %[[ENTRY]] ]
+; CHECK-NEXT:    ret float [[ADD]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ], [ [[IV_NEXT:%.*]], %[[FOR_INC]] ]
+; CHECK-NEXT:    [[S_011:%.*]] = phi float [ 1.000000e+00, %[[LOOP_PREHEADER]] ], [ [[S_1]], %[[FOR_INC]] ]
+; CHECK-NEXT:    [[T_010:%.*]] = phi float [ 1.000000e+00, %[[LOOP_PREHEADER]] ], [ [[T_1]], %[[FOR_INC]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label %[[FOR_INC]], label %[[IF_THEN:.*]]
+; CHECK:       [[IF_THEN]]:
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT:    br label %[[FOR_INC]]
+; CHECK:       [[FOR_INC]]:
+; CHECK-NEXT:    [[T_1]] = phi float [ [[TMP2]], %[[IF_THEN]] ], [ [[T_010]], %[[LOOP]] ]
+; CHECK-NEXT:    [[S_1]] = phi float [ [[TMP3]], %[[IF_THEN]] ], [ [[S_011]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[LOOP]]
 ;
 entry:
   %cmp9 = icmp sgt i32 %N, 0
@@ -1654,8 +2044,42 @@ for.inc:                                          ; preds = %loop, %if.then
 ;   return t; // use t
 ; }
 define i32 @csa_else_if_same_scalar_int(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
-; CHECK-LABEL: @csa_else_if_same_scalar_int(
-; CHECK-NOT: vector.body:
+; CHECK-LABEL: define i32 @csa_else_if_same_scalar_int(
+; CHECK-SAME: i32 [[N:%.*]], ptr [[COND0:%.*]], ptr [[COND1:%.*]], ptr [[DATA0:%.*]], ptr [[DATA1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP15]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT:.*]]
+; CHECK:       [[LOOP_PREHEADER]]:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[EXIT_LOOPEXIT:.*]]:
+; CHECK-NEXT:    [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], %[[FOR_INC:.*]] ]
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[T_1_LCSSA]], %[[EXIT_LOOPEXIT]] ]
+; CHECK-NEXT:    ret i32 [[T_0_LCSSA]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ], [ [[IV_NEXT:%.*]], %[[FOR_INC]] ]
+; CHECK-NEXT:    [[T_016:%.*]] = phi i32 [ -1, %[[LOOP_PREHEADER]] ], [ [[T_1]], %[[FOR_INC]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label %[[IF_ELSE:.*]], label %[[FOR_INC_SINK_SPLIT:.*]]
+; CHECK:       [[IF_ELSE]]:
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; CHECK-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL5_NOT]], label %[[FOR_INC]], label %[[FOR_INC_SINK_SPLIT]]
+; CHECK:       [[FOR_INC_SINK_SPLIT]]:
+; CHECK-NEXT:    [[DATA0_SINK:%.*]] = phi ptr [ [[DATA0]], %[[LOOP]] ], [ [[DATA1]], %[[IF_ELSE]] ]
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0_SINK]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    br label %[[FOR_INC]]
+; CHECK:       [[FOR_INC]]:
+; CHECK-NEXT:    [[T_1]] = phi i32 [ [[T_016]], %[[IF_ELSE]] ], [ [[TMP2]], %[[FOR_INC_SINK_SPLIT]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[LOOP]]
 ;
 entry:
   %cmp15 = icmp sgt i32 %N, 0
@@ -1709,8 +2133,42 @@ for.inc:                                          ; preds = %for.inc.sink.split,
 ;   return t; // use t
 ; }
 define float @csa_else_if_same_scalar_float(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
-; CHECK-LABEL: @csa_else_if_same_scalar_float(
-; CHECK-NOT: vector.body:
+; CHECK-LABEL: define float @csa_else_if_same_scalar_float(
+; CHECK-SAME: i32 [[N:%.*]], ptr [[COND0:%.*]], ptr [[COND1:%.*]], ptr [[DATA0:%.*]], ptr [[DATA1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP15]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT:.*]]
+; CHECK:       [[LOOP_PREHEADER]]:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[EXIT_LOOPEXIT:.*]]:
+; CHECK-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], %[[FOR_INC:.*]] ]
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, %[[ENTRY]] ], [ [[T_1_LCSSA]], %[[EXIT_LOOPEXIT]] ]
+; CHECK-NEXT:    ret float [[T_0_LCSSA]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ], [ [[IV_NEXT:%.*]], %[[FOR_INC]] ]
+; CHECK-NEXT:    [[T_016:%.*]] = phi float [ 1.000000e+00, %[[LOOP_PREHEADER]] ], [ [[T_1]], %[[FOR_INC]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label %[[IF_ELSE:.*]], label %[[FOR_INC_SINK_SPLIT:.*]]
+; CHECK:       [[IF_ELSE]]:
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; CHECK-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL5_NOT]], label %[[FOR_INC]], label %[[FOR_INC_SINK_SPLIT]]
+; CHECK:       [[FOR_INC_SINK_SPLIT]]:
+; CHECK-NEXT:    [[DATA0_SINK:%.*]] = phi ptr [ [[DATA0]], %[[LOOP]] ], [ [[DATA1]], %[[IF_ELSE]] ]
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0_SINK]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    br label %[[FOR_INC]]
+; CHECK:       [[FOR_INC]]:
+; CHECK-NEXT:    [[T_1]] = phi float [ [[T_016]], %[[IF_ELSE]] ], [ [[TMP2]], %[[FOR_INC_SINK_SPLIT]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[LOOP]]
 ;
 entry:
   %cmp15 = icmp sgt i32 %N, 0
@@ -1764,8 +2222,49 @@ for.inc:                                          ; preds = %for.inc.sink.split,
 ;   return t | s; // use t and s
 ; }
 define i32 @csa_else_if_int(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
-; CHECK-LABEL: @csa_else_if_int(
-; CHECK-NOT: vector.body:
+; CHECK-LABEL: define i32 @csa_else_if_int(
+; CHECK-SAME: i32 [[N:%.*]], ptr [[COND0:%.*]], ptr [[COND1:%.*]], ptr [[DATA0:%.*]], ptr [[DATA1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP15]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT:.*]]
+; CHECK:       [[LOOP_PREHEADER]]:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[EXIT_LOOPEXIT:.*]]:
+; CHECK-NEXT:    [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], %[[FOR_INC:.*]] ]
+; CHECK-NEXT:    [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], %[[FOR_INC]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[T_1_LCSSA]]
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP0]], %[[EXIT_LOOPEXIT]] ], [ -1, %[[ENTRY]] ]
+; CHECK-NEXT:    ret i32 [[OR]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ], [ [[IV_NEXT:%.*]], %[[FOR_INC]] ]
+; CHECK-NEXT:    [[S_017:%.*]] = phi i32 [ -1, %[[LOOP_PREHEADER]] ], [ [[S_1]], %[[FOR_INC]] ]
+; CHECK-NEXT:    [[T_016:%.*]] = phi i32 [ -1, %[[LOOP_PREHEADER]] ], [ [[T_1]], %[[FOR_INC]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label %[[IF_ELSE:.*]], label %[[IF_THEN:.*]]
+; CHECK:       [[IF_THEN]]:
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    br label %[[FOR_INC]]
+; CHECK:       [[IF_ELSE]]:
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; CHECK-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL5_NOT]], label %[[FOR_INC]], label %[[IF_THEN6:.*]]
+; CHECK:       [[IF_THEN6]]:
+; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4
+; CHECK-NEXT:    br label %[[FOR_INC]]
+; CHECK:       [[FOR_INC]]:
+; CHECK-NEXT:    [[T_1]] = phi i32 [ [[TMP2]], %[[IF_THEN]] ], [ [[T_016]], %[[IF_THEN6]] ], [ [[T_016]], %[[IF_ELSE]] ]
+; CHECK-NEXT:    [[S_1]] = phi i32 [ [[S_017]], %[[IF_THEN]] ], [ [[TMP4]], %[[IF_THEN6]] ], [ [[S_017]], %[[IF_ELSE]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[LOOP]]
 ;
 entry:
   %cmp15 = icmp sgt i32 %N, 0
@@ -1830,8 +2329,49 @@ for.inc:                                          ; preds = %if.then, %if.then6,
 ;   return t + s; // use t and s
 ; }
 define float @csa_else_if_float(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
-; CHECK-LABEL: @csa_else_if_float(
-; CHECK-NOT: vector.body:
+; CHECK-LABEL: define float @csa_else_if_float(
+; CHECK-SAME: i32 [[N:%.*]], ptr [[COND0:%.*]], ptr [[COND1:%.*]], ptr [[DATA0:%.*]], ptr [[DATA1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[CMP15:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP15]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT:.*]]
+; CHECK:       [[LOOP_PREHEADER]]:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[EXIT_LOOPEXIT:.*]]:
+; CHECK-NEXT:    [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], %[[FOR_INC:.*]] ]
+; CHECK-NEXT:    [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], %[[FOR_INC]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[ADD:%.*]] = phi float [ [[TMP0]], %[[EXIT_LOOPEXIT]] ], [ 2.000000e+00, %[[ENTRY]] ]
+; CHECK-NEXT:    ret float [[ADD]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ], [ [[IV_NEXT:%.*]], %[[FOR_INC]] ]
+; CHECK-NEXT:    [[S_017:%.*]] = phi float [ 1.000000e+00, %[[LOOP_PREHEADER]] ], [ [[S_1]], %[[FOR_INC]] ]
+; CHECK-NEXT:    [[T_016:%.*]] = phi float [ 1.000000e+00, %[[LOOP_PREHEADER]] ], [ [[T_1]], %[[FOR_INC]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label %[[IF_ELSE:.*]], label %[[IF_THEN:.*]]
+; CHECK:       [[IF_THEN]]:
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    br label %[[FOR_INC]]
+; CHECK:       [[IF_ELSE]]:
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; CHECK-NEXT:    [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL5_NOT]], label %[[FOR_INC]], label %[[IF_THEN6:.*]]
+; CHECK:       [[IF_THEN6]]:
+; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
+; CHECK-NEXT:    br label %[[FOR_INC]]
+; CHECK:       [[FOR_INC]]:
+; CHECK-NEXT:    [[T_1]] = phi float [ [[TMP2]], %[[IF_THEN]] ], [ [[T_016]], %[[IF_THEN6]] ], [ [[T_016]], %[[IF_ELSE]] ]
+; CHECK-NEXT:    [[S_1]] = phi float [ [[S_017]], %[[IF_THEN]] ], [ [[TMP4]], %[[IF_THEN6]] ], [ [[S_017]], %[[IF_ELSE]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[LOOP]]
 ;
 entry:
   %cmp15 = icmp sgt i32 %N, 0
@@ -1890,8 +2430,31 @@ for.inc:                                          ; preds = %if.then, %if.then6,
 ;   return idx;
 ; }
 define i64 @idx_scalar(ptr %a, ptr %b, i64 %ii, i64 %n) {
-; EVL-LABEL: @idx_scalar(
-; EVL-NOT: vector.body:
+; EVL-LABEL: define i64 @idx_scalar(
+; EVL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[II:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; EVL-NEXT:  [[ENTRY:.*]]:
+; EVL-NEXT:    [[CMP8_NOT:%.*]] = icmp eq i64 [[N]], 0
+; EVL-NEXT:    br i1 [[CMP8_NOT]], label %[[EXIT:.*]], label %[[LOOP_PREHEADER:.*]]
+; EVL:       [[LOOP_PREHEADER]]:
+; EVL-NEXT:    br label %[[LOOP:.*]]
+; EVL:       [[EXIT_LOOPEXIT:.*]]:
+; EVL-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], %[[LOOP]] ]
+; EVL-NEXT:    br label %[[EXIT]]
+; EVL:       [[EXIT]]:
+; EVL-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ]
+; EVL-NEXT:    ret i64 [[IDX_0_LCSSA]]
+; EVL:       [[LOOP]]:
+; EVL-NEXT:    [[I_010:%.*]] = phi i64 [ [[INC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; EVL-NEXT:    [[IDX_09:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ], [ [[II]], %[[LOOP_PREHEADER]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[I_010]]
+; EVL-NEXT:    [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; EVL-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[I_010]]
+; EVL-NEXT:    [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; EVL-NEXT:    [[CMP2:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]]
+; EVL-NEXT:    [[COND]] = select i1 [[CMP2]], i64 [[I_010]], i64 [[IDX_09]]
+; EVL-NEXT:    [[INC]] = add nuw i64 [[I_010]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[LOOP]]
 ;
 ; NO-EVL-LABEL: define i64 @idx_scalar(
 ; NO-EVL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[II:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
@@ -1922,7 +2485,7 @@ define i64 @idx_scalar(ptr %a, ptr %b, i64 %ii, i64 %n) {
 ; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], %[[VECTOR_BODY]] ]
 ; NO-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x i64> [ poison, %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], %[[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x i64> [ shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 poison, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer), %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], %[[VECTOR_BODY]] ]
 ; NO-EVL-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 0
 ; NO-EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]]
 ; NO-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP11]], i32 0
@@ -2006,8 +2569,31 @@ loop:                                         ; preds = %loop.preheader, %loop
 ;   return idx;
 ; }
 define dso_local i64 @idx_scalar_dec(ptr %a, ptr %b, i64 %ii, i64 %n) {
-; CHECK-LABEL: @idx_scalar_dec(
-; CHECK-NOT: vector.body:
+; CHECK-LABEL: define dso_local i64 @idx_scalar_dec(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[II:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[CMP_NOT9:%.*]] = icmp eq i64 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP_NOT9]], label %[[EXIT:.*]], label %[[LOOP_PREHEADER:.*]]
+; CHECK:       [[LOOP_PREHEADER]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[EXIT_LOOPEXIT:.*]]:
+; CHECK-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ]
+; CHECK-NEXT:    ret i64 [[IDX_0_LCSSA]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[I_011:%.*]] = phi i64 [ [[SUB:%.*]], %[[LOOP]] ], [ [[N]], %[[LOOP_PREHEADER]] ]
+; CHECK-NEXT:    [[IDX_010:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ], [ [[II]], %[[LOOP_PREHEADER]] ]
+; CHECK-NEXT:    [[SUB]] = add i64 [[I_011]], -1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[SUB]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[SUB]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[COND]] = select i1 [[CMP3]], i64 [[I_011]], i64 [[IDX_010]]
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i64 [[SUB]], 0
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label %[[EXIT_LOOPEXIT]], label %[[LOOP]]
 ;
 entry:
   %cmp.not9 = icmp eq i64 %n, 0
@@ -2046,8 +2632,31 @@ loop:                                         ; preds = %loop.preheader, %loop
 ;   return t; // use t
 ; }
 define i32 @simple_csa_int_select_neg_cond(i32 %N, ptr %data) {
-; EVL-LABEL: @simple_csa_int_select_neg_cond(
-; EVL-NOT: vector.body:
+; EVL-LABEL: define i32 @simple_csa_int_select_neg_cond(
+; EVL-SAME: i32 [[N:%.*]], ptr [[DATA:%.*]]) #[[ATTR0]] {
+; EVL-NEXT:  [[ENTRY:.*]]:
+; EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N]], 0
+; EVL-NEXT:    br i1 [[CMP9]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT:.*]]
+; EVL:       [[LOOP_PREHEADER]]:
+; EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT:    br label %[[LOOP:.*]]
+; EVL:       [[EXIT_LOOPEXIT:.*]]:
+; EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], %[[LOOP]] ]
+; EVL-NEXT:    br label %[[EXIT]]
+; EVL:       [[EXIT]]:
+; EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ]
+; EVL-NEXT:    ret i32 [[T_0_LCSSA]]
+; EVL:       [[LOOP]]:
+; EVL-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; EVL-NEXT:    [[T_010:%.*]] = phi i32 [ 0, %[[LOOP_PREHEADER]] ], [ [[SPEC_SELECT]], %[[LOOP]] ]
+; EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
+; EVL-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; EVL-NEXT:    [[CMP1_NOT:%.*]] = icmp eq i64 [[IV]], [[TMP1]]
+; EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP0]]
+; EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[LOOP]]
 ;
 ; NO-EVL-LABEL: define i32 @simple_csa_int_select_neg_cond(
 ; NO-EVL-SAME: i32 [[N:%.*]], ptr [[DATA:%.*]]) #[[ATTR0]] {
@@ -2079,7 +2688,7 @@ define i32 @simple_csa_int_select_neg_cond(i32 %N, ptr %data) {
 ; NO-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; NO-EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], %[[VECTOR_BODY]] ]
 ; NO-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], %[[VECTOR_BODY]] ]
+; NO-EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 poison, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], %[[VECTOR_BODY]] ]
 ; NO-EVL-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 0
 ; NO-EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[TMP10]]
 ; NO-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
@@ -2187,7 +2796,7 @@ define ptr @simple_csa_ptr_select(i32 %N, ptr %data, i64 %a) {
 ; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; EVL-NEXT:    [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], %[[VECTOR_BODY]] ]
-; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x ptr> [ poison, %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], %[[VECTOR_BODY]] ]
+; EVL-NEXT:    [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x ptr> [ shufflevector (<vscale x 2 x ptr> insertelement (<vscale x 2 x ptr> poison, ptr poison, i64 0), <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer), %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], %[[VECTOR_BODY]] ]
 ; EVL-NEXT:    [[CSA_VL_PHI:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[CSA_VL_SEL:%.*]], %[[VECTOR_BODY]] ]
 ; EVL-NEXT:    [[AVL:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[EVL_BASED_IV]]
 ; EVL-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
@@ -2236,8 +2845,32 @@ define ptr @simple_csa_ptr_select(i32 %N, ptr %data, i64 %a) {
 ; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
 ; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
 ;
-; NO-EVL-LABEL: @simple_csa_ptr_select(
-; NO-EVL-NOT: vector.body:
+; NO-EVL-LABEL: define ptr @simple_csa_ptr_select(
+; NO-EVL-SAME: i32 [[N:%.*]], ptr [[DATA:%.*]], i64 [[A:%.*]]) #[[ATTR0]] {
+; NO-EVL-NEXT:  [[ENTRY:.*]]:
+; NO-EVL-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N]], 0
+; NO-EVL-NEXT:    br i1 [[CMP9]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT:.*]]
+; NO-EVL:       [[LOOP_PREHEADER]]:
+; NO-EVL-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT:    br label %[[LOOP:.*]]
+; NO-EVL:       [[EXIT_LOOPEXIT:.*]]:
+; NO-EVL-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], %[[LOOP]] ]
+; NO-EVL-NEXT:    br label %[[EXIT]]
+; NO-EVL:       [[EXIT]]:
+; NO-EVL-NEXT:    [[T_0_LCSSA:%.*]] = phi ptr [ null, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ]
+; NO-EVL-NEXT:    ret ptr [[T_0_LCSSA]]
+; NO-EVL:       [[LOOP]]:
+; NO-EVL-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; NO-EVL-NEXT:    [[T_010:%.*]] = phi ptr [ null, %[[LOOP_PREHEADER]] ], [ [[SPEC_SELECT]], %[[LOOP]] ]
+; NO-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA]], i64 [[IV]]
+; NO-EVL-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
+; NO-EVL-NEXT:    [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+; NO-EVL-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
+; NO-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP2]]
+; NO-EVL-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP0]], ptr [[T_010]]
+; NO-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[LOOP]]
 ;
 entry:
   %cmp9 = icmp sgt i32 %N, 0
@@ -2264,3 +2897,36 @@ loop:
   %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
   br i1 %exitcond.not, label %exit, label %loop
 }
+;.
+; EVL: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; EVL: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; EVL: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; EVL: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; EVL: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; EVL: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; EVL: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; EVL: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; EVL: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; EVL: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; EVL: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; EVL: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
+;.
+; NO-EVL: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; NO-EVL: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; NO-EVL: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; NO-EVL: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; NO-EVL: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; NO-EVL: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; NO-EVL: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; NO-EVL: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; NO-EVL: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; NO-EVL: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; NO-EVL: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; NO-EVL: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
+; NO-EVL: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
+; NO-EVL: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
+; NO-EVL: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
+; NO-EVL: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]}
+; NO-EVL: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]}
+; NO-EVL: [[LOOP17]] = distinct !{[[LOOP17]], [[META2]], [[META1]]}
+;.



More information about the llvm-commits mailing list