[llvm] [LoopVectorize] Add support for vectorisation of more early exit loops (PR #88385)

David Sherwood via llvm-commits llvm-commits at lists.llvm.org
Thu Jul 25 05:19:02 PDT 2024


https://github.com/david-arm updated https://github.com/llvm/llvm-project/pull/88385

>From e08077c04db348cea004369db4510d2362bd89a8 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Thu, 25 Jul 2024 12:05:04 +0000
Subject: [PATCH 1/3] [LoopVectorize] Add tests for dereferenceable loads in
 more loops

* Adds tests for strided accesses.
* Adds tests for reverse loops.

As part of this I've moved one of the negative tests from
load-deref-pred-align.ll into a new file
(load-deref-pred-neg-off.ll), because the pointer type in that
file had a size of 16 bits and I realised it's probably not
sensible to create allocas that are too large to address with a
16-bit pointer!

I've also tweaked
scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll
to use an alloca for the array access rather than a global, since
this more reliably leads to the generation of dereferenceable loads.
---
 ...demanding-all-lanes-and-first-lane-only.ll |  38 +-
 .../LoopVectorize/load-deref-pred-align.ll    | 428 +++++++++++++++++-
 .../LoopVectorize/load-deref-pred-neg-off.ll  | 104 +++++
 3 files changed, 552 insertions(+), 18 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/load-deref-pred-neg-off.ll

diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll
index fcf1ba072a62c..00081a42151f6 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll
@@ -3,7 +3,7 @@
 
 target datalayout = "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64"
 
-@src = external global [8 x i32], align 4
+declare void @init()
 
 ; Test case where scalar steps are used by both a VPReplicateRecipe (demands
 ; all scalar lanes) and a VPInstruction that only demands the first lane.
@@ -12,34 +12,36 @@ define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst(p
 ; CHECK-LABEL: define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst(
 ; CHECK-SAME: ptr noalias [[DST:%.*]], ptr noalias [[SRC_1:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[SRC:%.*]] = alloca [8 x i32], align 4
+; CHECK-NEXT:    call void @init(ptr [[SRC]])
 ; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE6:.*]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[IV:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 3
-; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw i64 [[TMP0]], 4
+; CHECK-NEXT:    [[MUL_IV:%.*]] = mul nsw i64 [[IV]], 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul nsw i64 [[TMP1]], 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul nsw i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul nsw i64 [[TMP3]], 4
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP4]]
+; CHECK-NEXT:    [[GEP_SRC_1:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[MUL_IV]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP5]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[L_1:%.*]] = load i8, ptr [[GEP_SRC_1]], align 1
 ; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr [[TMP9]], align 1
 ; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr [[TMP10]], align 1
 ; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[TMP11]], align 1
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i8> poison, i8 [[TMP12]], i32 0
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i8> poison, i8 [[L_1]], i32 0
 ; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x i8> [[TMP16]], i8 [[TMP13]], i32 1
 ; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <4 x i8> [[TMP17]], i8 [[TMP14]], i32 2
 ; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> [[TMP18]], i8 [[TMP15]], i32 3
 ; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer
-; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[TMP0]], 4
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr [8 x i32], ptr @src, i64 0, i64 [[TMP21]]
+; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[IV]], 4
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr [8 x i32], ptr [[SRC]], i64 0, i64 [[TMP21]]
 ; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i32, ptr [[TMP22]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP23]], align 4
 ; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i1> [[TMP20]], i32 0
@@ -78,26 +80,28 @@ define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst(p
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
 ; CHECK:       [[LOOP_HEADER]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT:    [[MUL_IV:%.*]] = mul nsw i64 [[IV]], 4
-; CHECK-NEXT:    [[GEP_SRC_1:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[MUL_IV]]
-; CHECK-NEXT:    [[L_1:%.*]] = load i8, ptr [[GEP_SRC_1]], align 1
-; CHECK-NEXT:    [[C:%.*]] = icmp eq i8 [[L_1]], 0
+; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT:    [[MUL_IV1:%.*]] = mul nsw i64 [[IV1]], 4
+; CHECK-NEXT:    [[GEP_SRC_2:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[MUL_IV1]]
+; CHECK-NEXT:    [[L_3:%.*]] = load i8, ptr [[GEP_SRC_2]], align 1
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i8 [[L_3]], 0
 ; CHECK-NEXT:    br i1 [[C]], label %[[THEN:.*]], label %[[LOOP_LATCH]]
 ; CHECK:       [[THEN]]:
-; CHECK-NEXT:    [[IV_OR:%.*]] = or disjoint i64 [[IV]], 4
-; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds [8 x i32], ptr @src, i64 0, i64 [[IV_OR]]
+; CHECK-NEXT:    [[IV_OR:%.*]] = or disjoint i64 [[IV1]], 4
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds [8 x i32], ptr [[SRC]], i64 0, i64 [[IV_OR]]
 ; CHECK-NEXT:    [[L_2:%.*]] = load i32, ptr [[GEP_SRC]], align 4
 ; CHECK-NEXT:    store i32 [[L_2]], ptr [[DST]], align 4
 ; CHECK-NEXT:    br label %[[LOOP_LATCH]]
 ; CHECK:       [[LOOP_LATCH]]:
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 4
 ; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
 entry:
+  %src = alloca [8 x i32], align 4
+  call void @init(ptr %src)
   br label %loop.header
 
 loop.header:
@@ -110,7 +114,7 @@ loop.header:
 
 then:
   %iv.or = or disjoint i64 %iv, 4
-  %gep.src = getelementptr inbounds [8 x i32], ptr @src, i64 0, i64 %iv.or
+  %gep.src = getelementptr inbounds [8 x i32], ptr %src, i64 0, i64 %iv.or
   %l.2 = load i32, ptr %gep.src, align 4
   store i32 %l.2, ptr %dst, align 4
   br label %loop.latch
diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll
index a7c9a18127ade..344181bfa0d1e 100644
--- a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll
+++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S %s | FileCheck %s
 
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1-p:16:16:16:16"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
 
 declare void @init(ptr nocapture nofree)
 
@@ -296,3 +296,429 @@ latch:
 loop_exit:
   ret i8 %accum.next
 }
+
+; Test reverse loops where we can prove loads in predicated blocks are safe
+; to load unconditionally.
+define void @test_rev_loops_deref_loads(ptr nocapture noundef writeonly %dest) {
+; CHECK-LABEL: @test_rev_loops_deref_loads(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LOCAL_DEST:%.*]] = alloca [1024 x i32], align 4
+; CHECK-NEXT:    [[LOCAL_SRC:%.*]] = alloca [1024 x i32], align 4
+; CHECK-NEXT:    [[LOCAL_CMP:%.*]] = alloca [1024 x i32], align 4
+; CHECK-NEXT:    call void @init(ptr [[LOCAL_SRC]])
+; CHECK-NEXT:    call void @init(ptr [[LOCAL_CMP]])
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_CMP]], i64 0, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 -1
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4
+; CHECK-NEXT:    [[REVERSE:%.*]] = shufflevector <2 x i32> [[WIDE_LOAD]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <2 x i32> [[REVERSE]], <i32 3, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = xor <2 x i1> [[TMP4]], <i1 true, i1 true>
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
+; CHECK-NEXT:    br i1 [[TMP6]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK:       pred.store.if:
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP10:%.*]] = shl nsw i32 [[TMP8]], 2
+; CHECK-NEXT:    store i32 [[TMP10]], ptr [[TMP9]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; CHECK:       pred.store.continue:
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
+; CHECK-NEXT:    br i1 [[TMP11]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
+; CHECK:       pred.store.if1:
+; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], -1
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = load i32, ptr [[TMP13]], align 4
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP16:%.*]] = shl nsw i32 [[TMP14]], 2
+; CHECK-NEXT:    store i32 [[TMP16]], ptr [[TMP15]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE2]]
+; CHECK:       pred.store.continue2:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, [[MIDDLE_BLOCK]] ], [ 1023, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_CMP]], i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[CMP3_NOT:%.*]] = icmp eq i32 [[TMP18]], 3
+; CHECK-NEXT:    br i1 [[CMP3_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i32 [[TMP19]], 2
+; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i32 [[MUL]], ptr [[ARRAYIDX7]], align 4
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; CHECK-NEXT:    [[CMP2_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], 0
+; CHECK-NEXT:    br i1 [[CMP2_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(1024) [[DEST:%.*]], ptr noundef nonnull align 4 dereferenceable(1024) [[LOCAL_DEST]], i64 1024, i1 false)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %local_dest = alloca [1024 x i32], align 4
+  %local_src = alloca [1024 x i32], align 4
+  %local_cmp = alloca [1024 x i32], align 4
+  call void @init(ptr %local_src)
+  call void @init(ptr %local_cmp)
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.inc ]
+  %arrayidx = getelementptr inbounds [1024 x i32], ptr %local_cmp, i64 0, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %cmp3.not = icmp eq i32 %0, 3
+  br i1 %cmp3.not, label %for.inc, label %if.then
+
+if.then:
+  %arrayidx5 = getelementptr inbounds [1024 x i32], ptr %local_src, i64 0, i64 %indvars.iv
+  %1 = load i32, ptr %arrayidx5, align 4
+  %mul = shl nsw i32 %1, 2
+  %arrayidx7 = getelementptr inbounds [1024 x i32], ptr %local_dest, i64 0, i64 %indvars.iv
+  store i32 %mul, ptr %arrayidx7, align 4
+  br label %for.inc
+
+for.inc:
+  %indvars.iv.next = add nsw i64 %indvars.iv, -1
+  %cmp2.not = icmp eq i64 %indvars.iv, 0
+  br i1 %cmp2.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(1024) %dest, ptr noundef nonnull align 4 dereferenceable(1024) %local_dest, i64 1024, i1 false)
+  ret void
+}
+
+
+; Test reverse loops where we *cannot* prove loads in predicated blocks are safe
+; to load unconditionally.
+define void @test_rev_loops_non_deref_loads(ptr nocapture noundef writeonly %dest) {
+; CHECK-LABEL: @test_rev_loops_non_deref_loads(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LOCAL_DEST:%.*]] = alloca [1024 x i32], align 4
+; CHECK-NEXT:    [[LOCAL_SRC:%.*]] = alloca [1024 x i32], align 4
+; CHECK-NEXT:    [[LOCAL_CMP:%.*]] = alloca [1024 x i32], align 4
+; CHECK-NEXT:    call void @init(ptr [[LOCAL_SRC]])
+; CHECK-NEXT:    call void @init(ptr [[LOCAL_CMP]])
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 1023, i64 1022>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE2]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add <2 x i64> [[VEC_IND]], <i64 -1, i64 -1>
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_CMP]], i64 0, i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 -1
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4
+; CHECK-NEXT:    [[REVERSE:%.*]] = shufflevector <2 x i32> [[WIDE_LOAD]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq <2 x i32> [[REVERSE]], <i32 3, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <2 x i1> [[TMP5]], <i1 true, i1 true>
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0
+; CHECK-NEXT:    br i1 [[TMP7]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK:       pred.store.if:
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = shl nsw i32 [[TMP10]], 2
+; CHECK-NEXT:    store i32 [[TMP13]], ptr [[TMP12]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; CHECK:       pred.store.continue:
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1
+; CHECK-NEXT:    br i1 [[TMP14]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
+; CHECK:       pred.store.if1:
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP15]]
+; CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4
+; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP18]]
+; CHECK-NEXT:    [[TMP20:%.*]] = shl nsw i32 [[TMP17]], 2
+; CHECK-NEXT:    store i32 [[TMP20]], ptr [[TMP19]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE2]]
+; CHECK:       pred.store.continue2:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 -2, i64 -2>
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, [[MIDDLE_BLOCK]] ], [ 1023, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT:    [[OFF:%.*]] = add i64 [[INDVARS_IV]], -1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_CMP]], i64 0, i64 [[OFF]]
+; CHECK-NEXT:    [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[CMP3_NOT:%.*]] = icmp eq i32 [[TMP22]], 3
+; CHECK-NEXT:    br i1 [[CMP3_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[OFF]]
+; CHECK-NEXT:    [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i32 [[TMP23]], 2
+; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[OFF]]
+; CHECK-NEXT:    store i32 [[MUL]], ptr [[ARRAYIDX7]], align 4
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; CHECK-NEXT:    [[CMP2_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], 0
+; CHECK-NEXT:    br i1 [[CMP2_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(1024) [[DEST:%.*]], ptr noundef nonnull align 4 dereferenceable(1024) [[LOCAL_DEST]], i64 1024, i1 false)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %local_dest = alloca [1024 x i32], align 4
+  %local_src = alloca [1024 x i32], align 4
+  %local_cmp = alloca [1024 x i32], align 4
+  call void @init(ptr %local_src)
+  call void @init(ptr %local_cmp)
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.inc ]
+  %off = add i64 %indvars.iv, -1
+  %arrayidx = getelementptr inbounds [1024 x i32], ptr %local_cmp, i64 0, i64 %off
+  %0 = load i32, ptr %arrayidx, align 4
+  %cmp3.not = icmp eq i32 %0, 3
+  br i1 %cmp3.not, label %for.inc, label %if.then
+
+if.then:
+  %arrayidx5 = getelementptr inbounds [1024 x i32], ptr %local_src, i64 0, i64 %off
+  %1 = load i32, ptr %arrayidx5, align 4
+  %mul = shl nsw i32 %1, 2
+  %arrayidx7 = getelementptr inbounds [1024 x i32], ptr %local_dest, i64 0, i64 %off
+  store i32 %mul, ptr %arrayidx7, align 4
+  br label %for.inc
+
+for.inc:
+  %indvars.iv.next = add nsw i64 %indvars.iv, -1
+  %cmp2.not = icmp eq i64 %indvars.iv, 0
+  br i1 %cmp2.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(1024) %dest, ptr noundef nonnull align 4 dereferenceable(1024) %local_dest, i64 1024, i1 false)
+  ret void
+}
+
+
+; Test a loop with a positive step recurrence that has a strided access
+define i16 @test_strided_access(i64 %len, ptr %test_base) {
+; CHECK-LABEL: @test_strided_access(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [163840 x i16], align 4
+; CHECK-NEXT:    call void @init(ptr [[ALLOCA]])
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp sge <2 x i8> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = mul <2 x i64> [[VEC_IND]], <i64 2, i64 2>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = load i16, ptr [[TMP6]], align 2
+; CHECK-NEXT:    [[TMP10:%.*]] = load i16, ptr [[TMP8]], align 2
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x i16> poison, i16 [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <2 x i16> [[TMP11]], i16 [[TMP10]], i32 1
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i16> [[TMP12]], <2 x i16> zeroinitializer
+; CHECK-NEXT:    [[TMP13]] = add <2 x i16> [[VEC_PHI]], [[PREDPHI]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP15:%.*]] = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> [[TMP13]])
+; CHECK-NEXT:    br i1 true, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i16 [ [[TMP15]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
+; CHECK-NEXT:    [[ACCUM:%.*]] = phi i16 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE]], i64 [[IV]]
+; CHECK-NEXT:    [[L_T:%.*]] = load i8, ptr [[TEST_ADDR]], align 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sge i8 [[L_T]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[PRED:%.*]], label [[LATCH]]
+; CHECK:       pred:
+; CHECK-NEXT:    [[IV_STRIDE:%.*]] = mul i64 [[IV]], 2
+; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[IV_STRIDE]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i16, ptr [[ADDR]], align 2
+; CHECK-NEXT:    br label [[LATCH]]
+; CHECK:       latch:
+; CHECK-NEXT:    [[VAL_PHI:%.*]] = phi i16 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ]
+; CHECK-NEXT:    [[ACCUM_NEXT]] = add i16 [[ACCUM]], [[VAL_PHI]]
+; CHECK-NEXT:    [[EXIT:%.*]] = icmp eq i64 [[IV]], 4095
+; CHECK-NEXT:    br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK:       loop_exit:
+; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i16 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i16 [[ACCUM_NEXT_LCSSA]]
+;
+entry:
+  %alloca = alloca [163840 x i16], align 4
+  call void @init(ptr %alloca)
+  br label %loop
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
+  %accum = phi i16 [ 0, %entry ], [ %accum.next, %latch ]
+  %iv.next = add i64 %iv, 1
+  %test_addr = getelementptr inbounds i8, ptr %test_base, i64 %iv
+  %l.t = load i8, ptr %test_addr
+  %cmp = icmp sge i8 %l.t, 0
+  br i1 %cmp, label %pred, label %latch
+pred:
+  %iv.stride = mul i64 %iv, 2
+  %addr = getelementptr inbounds i16, ptr %alloca, i64 %iv.stride
+  %val = load i16, ptr %addr, align 2
+  br label %latch
+latch:
+  %val.phi = phi i16 [0, %loop], [%val, %pred]
+  %accum.next = add i16 %accum, %val.phi
+  %exit = icmp eq i64 %iv, 4095
+  br i1 %exit, label %loop_exit, label %loop
+
+loop_exit:
+  ret i16 %accum.next
+}
+
+
+; Test a loop with a negative step recurrence that has a strided access
+define void @test_rev_loops_strided_deref_loads(ptr nocapture noundef writeonly %dest) {
+; CHECK-LABEL: @test_rev_loops_strided_deref_loads(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LOCAL_DEST:%.*]] = alloca [1024 x i32], align 4
+; CHECK-NEXT:    [[LOCAL_SRC:%.*]] = alloca [1024 x i32], align 4
+; CHECK-NEXT:    [[LOCAL_CMP:%.*]] = alloca [1024 x i32], align 4
+; CHECK-NEXT:    call void @init(ptr [[LOCAL_SRC]])
+; CHECK-NEXT:    call void @init(ptr [[LOCAL_CMP]])
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 511, i64 510>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE2]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 511, [[INDEX]]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_CMP]], i64 0, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 -1
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4
+; CHECK-NEXT:    [[REVERSE:%.*]] = shufflevector <2 x i32> [[WIDE_LOAD]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <2 x i32> [[REVERSE]], <i32 3, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = xor <2 x i1> [[TMP4]], <i1 true, i1 true>
+; CHECK-NEXT:    [[TMP6:%.*]] = mul <2 x i64> [[VEC_IND]], <i64 2, i64 2>
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
+; CHECK-NEXT:    br i1 [[TMP7]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK:       pred.store.if:
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP12:%.*]] = shl nsw i32 [[TMP10]], 2
+; CHECK-NEXT:    store i32 [[TMP12]], ptr [[TMP11]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; CHECK:       pred.store.continue:
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
+; CHECK-NEXT:    br i1 [[TMP13]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
+; CHECK:       pred.store.if1:
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], -1
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP15]]
+; CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP19:%.*]] = shl nsw i32 [[TMP17]], 2
+; CHECK-NEXT:    store i32 [[TMP19]], ptr [[TMP18]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE2]]
+; CHECK:       pred.store.continue2:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 -2, i64 -2>
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
+; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, [[MIDDLE_BLOCK]] ], [ 511, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_CMP]], i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[CMP3_NOT:%.*]] = icmp eq i32 [[TMP21]], 3
+; CHECK-NEXT:    br i1 [[CMP3_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[INDVARS_IV_STRIDED:%.*]] = mul i64 [[INDVARS_IV]], 2
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[INDVARS_IV_STRIDED]]
+; CHECK-NEXT:    [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i32 [[TMP22]], 2
+; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i32 [[MUL]], ptr [[ARRAYIDX7]], align 4
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; CHECK-NEXT:    [[CMP2_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], 0
+; CHECK-NEXT:    br i1 [[CMP2_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(1024) [[DEST:%.*]], ptr noundef nonnull align 4 dereferenceable(1024) [[LOCAL_DEST]], i64 1024, i1 false)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %local_dest = alloca [1024 x i32], align 4
+  %local_src = alloca [1024 x i32], align 4
+  %local_cmp = alloca [1024 x i32], align 4
+  call void @init(ptr %local_src)
+  call void @init(ptr %local_cmp)
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 511, %entry ], [ %indvars.iv.next, %for.inc ]
+  %arrayidx = getelementptr inbounds [1024 x i32], ptr %local_cmp, i64 0, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %cmp3.not = icmp eq i32 %0, 3
+  br i1 %cmp3.not, label %for.inc, label %if.then
+
+if.then:
+  %indvars.iv.strided = mul i64 %indvars.iv, 2
+  %arrayidx5 = getelementptr inbounds [1024 x i32], ptr %local_src, i64 0, i64 %indvars.iv.strided
+  %1 = load i32, ptr %arrayidx5, align 4
+  %mul = shl nsw i32 %1, 2
+  %arrayidx7 = getelementptr inbounds [1024 x i32], ptr %local_dest, i64 0, i64 %indvars.iv
+  store i32 %mul, ptr %arrayidx7, align 4
+  br label %for.inc
+
+for.inc:
+  %indvars.iv.next = add nsw i64 %indvars.iv, -1
+  %cmp2.not = icmp eq i64 %indvars.iv, 0
+  br i1 %cmp2.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(1024) %dest, ptr noundef nonnull align 4 dereferenceable(1024) %local_dest, i64 1024, i1 false)
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-neg-off.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-neg-off.ll
new file mode 100644
index 0000000000000..1dd526df503bd
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-neg-off.ll
@@ -0,0 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1-p:16:16:16:16"
+
+declare void @init(ptr nocapture nofree)
+
+
+; Test where offset relative to alloca is negative and we shouldn't
+; treat predicated loads as being always dereferenceable.
+define i8 @test_negative_off(i16 %len, ptr %test_base) {
+; CHECK-LABEL: @test_negative_off(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [64638 x i8], align 1
+; CHECK-NEXT:    call void @init(ptr [[ALLOCA]])
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE2:%.*]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i8> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[PRED_LOAD_CONTINUE2]] ]
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i16 -1000, [[DOTCAST]]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add i16 [[OFFSET_IDX]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE:%.*]], i16 [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i16 [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i1, ptr [[TMP2]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = load i1, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i1> poison, i1 [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i1> [[TMP6]], i1 [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; CHECK-NEXT:    br i1 [[TMP8]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; CHECK:       pred.load.if:
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[ALLOCA]], i16 [[TMP0]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x i8> poison, i8 [[TMP10]], i32 0
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; CHECK:       pred.load.continue:
+; CHECK-NEXT:    [[TMP12:%.*]] = phi <2 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP11]], [[PRED_LOAD_IF]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
+; CHECK-NEXT:    br i1 [[TMP13]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2]]
+; CHECK:       pred.load.if1:
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[ALLOCA]], i16 [[TMP1]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[TMP14]], align 1
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <2 x i8> [[TMP12]], i8 [[TMP15]], i32 1
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE2]]
+; CHECK:       pred.load.continue2:
+; CHECK-NEXT:    [[TMP17:%.*]] = phi <2 x i8> [ [[TMP12]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP16]], [[PRED_LOAD_IF1]] ]
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP7]], <2 x i8> [[TMP17]], <2 x i8> zeroinitializer
+; CHECK-NEXT:    [[TMP18]] = add <2 x i8> [[VEC_PHI]], [[PREDPHI]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], 12
+; CHECK-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP20:%.*]] = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> [[TMP18]])
+; CHECK-NEXT:    br i1 true, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ -988, [[MIDDLE_BLOCK]] ], [ -1000, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i8 [ [[TMP20]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
+; CHECK-NEXT:    [[ACCUM:%.*]] = phi i8 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
+; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i16 [[IV]]
+; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1
+; CHECK-NEXT:    br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]]
+; CHECK:       pred:
+; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr i8, ptr [[ALLOCA]], i16 [[IV]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i8, ptr [[ADDR]], align 1
+; CHECK-NEXT:    br label [[LATCH]]
+; CHECK:       latch:
+; CHECK-NEXT:    [[VAL_PHI:%.*]] = phi i8 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ]
+; CHECK-NEXT:    [[ACCUM_NEXT]] = add i8 [[ACCUM]], [[VAL_PHI]]
+; CHECK-NEXT:    [[EXIT:%.*]] = icmp ugt i16 [[IV]], -990
+; CHECK-NEXT:    br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       loop_exit:
+; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i8 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i8 [[ACCUM_NEXT_LCSSA]]
+;
+entry:
+  %alloca = alloca [64638 x i8]
+  call void @init(ptr %alloca)
+  br label %loop
+loop:
+  %iv = phi i16 [ -1000, %entry ], [ %iv.next, %latch ]
+  %accum = phi i8 [ 0, %entry ], [ %accum.next, %latch ]
+  %iv.next = add i16 %iv, 1
+  %test_addr = getelementptr inbounds i1, ptr %test_base, i16 %iv
+  %earlycnd = load i1, ptr %test_addr
+  br i1 %earlycnd, label %pred, label %latch
+pred:
+  %addr = getelementptr i8, ptr %alloca, i16 %iv
+  %val = load i8, ptr %addr
+  br label %latch
+latch:
+  %val.phi = phi i8 [ 0, %loop ], [ %val, %pred ]
+  %accum.next = add i8 %accum, %val.phi
+  %exit = icmp ugt i16 %iv, -990
+  br i1 %exit, label %loop_exit, label %loop
+loop_exit:
+  ret i8 %accum.next
+}

>From e3eebf415989d196cdd6a6c5b2235ccc799169d7 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Thu, 25 Jul 2024 12:05:32 +0000
Subject: [PATCH 2/3] [LoopVectorize] Add support for reverse loops in
 isDereferenceableAndAlignedInLoop

Currently, when we encounter a negative step in the induction
variable, isDereferenceableAndAlignedInLoop bails out because the
element size compares signed greater-than the step. This patch
adds support for negative steps in cases where we detect that the
start address of the load is of the form base + offset. In this
case the address decrements on each iteration, so we need to
calculate the access size differently.
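
As an illustration (assuming a 4-byte element type and a trip
count of 1024), given a reverse loop such as

  for (int i = 1023; i >= 0; i--)
    sum += p[i];

the load address follows the recurrence {(p + 4092),+,-4}: the
lowest address accessed is p (on the final iteration) and the
highest is p + 4092, so the total access size we must prove
dereferenceable is (max - min) + element size = 4092 + 4 = 4096
bytes.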

The motivation for this patch comes from PR #88385, where a
reviewer requested reusing isDereferenceableAndAlignedInLoop.
That PR itself does support reverse loops, so this function needs
to handle them too.
---
 llvm/lib/Analysis/Loads.cpp                   | 69 ++++++++++-----
 .../LoopVectorize/load-deref-pred-align.ll    | 85 ++++++++++---------
 2 files changed, 94 insertions(+), 60 deletions(-)

diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index 61c6aa5e5a3eb..0fdd264da8f9b 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -293,7 +293,15 @@ bool llvm::isDereferenceableAndAlignedInLoop(LoadInst *LI, Loop *L,
 
   // TODO: Handle overlapping accesses.
   // We should be computing AccessSize as (TC - 1) * Step + EltSize.
-  if (EltSize.sgt(Step->getAPInt()))
+  bool StepIsNegative = Step->getAPInt().isNegative();
+  APInt AbsStep = Step->getAPInt().abs();
+  if (EltSize.ugt(AbsStep))
+    return false;
+
+  // For the moment, restrict ourselves to the case where the access size is a
+  // multiple of the requested alignment and the base is aligned.
+  // TODO: generalize if a case found which warrants
+  if (EltSize.urem(Alignment.value()) != 0)
     return false;
 
   // Compute the total access size for access patterns with unit stride and
@@ -301,18 +309,49 @@ bool llvm::isDereferenceableAndAlignedInLoop(LoadInst *LI, Loop *L,
   // same.
   // For patterns with gaps (i.e. non unit stride), we are
   // accessing EltSize bytes at every Step.
-  APInt AccessSize = TC * Step->getAPInt();
+  APInt AccessSize = TC * AbsStep;
 
   assert(SE.isLoopInvariant(AddRec->getStart(), L) &&
          "implied by addrec definition");
   Value *Base = nullptr;
   if (auto *StartS = dyn_cast<SCEVUnknown>(AddRec->getStart())) {
+    if (StepIsNegative)
+      return false;
     Base = StartS->getValue();
   } else if (auto *StartS = dyn_cast<SCEVAddExpr>(AddRec->getStart())) {
-    // Handle (NewBase + offset) as start value.
-    const auto *Offset = dyn_cast<SCEVConstant>(StartS->getOperand(0));
-    const auto *NewBase = dyn_cast<SCEVUnknown>(StartS->getOperand(1));
-    if (StartS->getNumOperands() == 2 && Offset && NewBase) {
+    const SCEV *End = AddRec->evaluateAtIteration(
+        SE.getConstant(StartS->getType(), TC - 1), SE);
+
+    // The step recurrence could be negative so it's necessary to find the min
+    // and max accessed addresses in the loop.
+    const SCEV *Min = SE.getUMinExpr(StartS, End);
+    const SCEV *Max = SE.getUMaxExpr(StartS, End);
+    if (isa<SCEVCouldNotCompute>(Min) || isa<SCEVCouldNotCompute>(Max))
+      return false;
+
+    // Now calculate the total access size, which is (max - min) + element_size.
+    const SCEV *Diff = SE.getMinusSCEV(Max, Min);
+    if (isa<SCEVCouldNotCompute>(Diff))
+      return false;
+
+    const SCEV *AS = SE.getAddExpr(
+        Diff, SE.getConstant(Diff->getType(), EltSize.getZExtValue()));
+    auto *ASC = dyn_cast<SCEVConstant>(AS);
+    if (!ASC)
+      return false;
+
+    if (const SCEVUnknown *NewBase = dyn_cast<SCEVUnknown>(Min)) {
+      Base = NewBase->getValue();
+      AccessSize = ASC->getAPInt();
+    } else if (auto *MinAddRec = dyn_cast<SCEVAddExpr>(Min)) {
+      if (MinAddRec->getNumOperands() != 2)
+        return false;
+
+      const auto *Offset = dyn_cast<SCEVConstant>(MinAddRec->getOperand(0));
+      const auto *NewBase = dyn_cast<SCEVUnknown>(MinAddRec->getOperand(1));
+      if (!Offset || !NewBase)
+        return false;
+
       // The following code below assumes the offset is unsigned, but GEP
       // offsets are treated as signed so we can end up with a signed value
       // here too. For example, suppose the initial PHI value is (i8 255),
@@ -325,22 +364,14 @@ bool llvm::isDereferenceableAndAlignedInLoop(LoadInst *LI, Loop *L,
       // TODO: generalize if a case found which warrants
       if (Offset->getAPInt().urem(Alignment.value()) != 0)
         return false;
-      Base = NewBase->getValue();
-      bool Overflow = false;
-      AccessSize = AccessSize.uadd_ov(Offset->getAPInt(), Overflow);
-      if (Overflow)
-        return false;
-    }
-  }
 
-  if (!Base)
+      AccessSize = ASC->getAPInt() + Offset->getAPInt();
+      Base = NewBase->getValue();
+    } else
+      return false;
+  } else
     return false;
 
-  // For the moment, restrict ourselves to the case where the access size is a
-  // multiple of the requested alignment and the base is aligned.
-  // TODO: generalize if a case found which warrants
-  if (EltSize.urem(Alignment.value()) != 0)
-    return false;
   return isDereferenceableAndAlignedPointer(Base, Alignment, AccessSize, DL,
                                             HeaderFirstNonPHI, AC, &DT);
 }
diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll
index 344181bfa0d1e..f4a8777bf1487 100644
--- a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll
+++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll
@@ -311,7 +311,7 @@ define void @test_rev_loops_deref_loads(ptr nocapture noundef writeonly %dest) {
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_CMP]], i64 0, i64 [[TMP0]]
@@ -321,30 +321,33 @@ define void @test_rev_loops_deref_loads(ptr nocapture noundef writeonly %dest) {
 ; CHECK-NEXT:    [[REVERSE:%.*]] = shufflevector <2 x i32> [[WIDE_LOAD]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <2 x i32> [[REVERSE]], <i32 3, i32 3>
 ; CHECK-NEXT:    [[TMP5:%.*]] = xor <2 x i1> [[TMP4]], <i1 true, i1 true>
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
-; CHECK-NEXT:    br i1 [[TMP6]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i32, ptr [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i32 -1
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <2 x i32>, ptr [[TMP8]], align 4
+; CHECK-NEXT:    [[REVERSE2:%.*]] = shufflevector <2 x i32> [[WIDE_LOAD1]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
+; CHECK-NEXT:    br i1 [[TMP9]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; CHECK:       pred.store.if:
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP10:%.*]] = shl nsw i32 [[TMP8]], 2
-; CHECK-NEXT:    store i32 [[TMP10]], ptr [[TMP9]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i32> [[REVERSE2]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = shl nsw i32 [[TMP11]], 2
+; CHECK-NEXT:    store i32 [[TMP12]], ptr [[TMP10]], align 4
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; CHECK:       pred.store.continue:
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
-; CHECK-NEXT:    br i1 [[TMP11]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
-; CHECK:       pred.store.if1:
-; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], -1
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = load i32, ptr [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP16:%.*]] = shl nsw i32 [[TMP14]], 2
-; CHECK-NEXT:    store i32 [[TMP16]], ptr [[TMP15]], align 4
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE2]]
-; CHECK:       pred.store.continue2:
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
+; CHECK-NEXT:    br i1 [[TMP13]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4]]
+; CHECK:       pred.store.if3:
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], -1
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x i32> [[REVERSE2]], i32 1
+; CHECK-NEXT:    [[TMP17:%.*]] = shl nsw i32 [[TMP16]], 2
+; CHECK-NEXT:    store i32 [[TMP17]], ptr [[TMP15]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE4]]
+; CHECK:       pred.store.continue4:
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -353,13 +356,13 @@ define void @test_rev_loops_deref_loads(ptr nocapture noundef writeonly %dest) {
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_CMP]], i64 0, i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[CMP3_NOT:%.*]] = icmp eq i32 [[TMP18]], 3
+; CHECK-NEXT:    [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[CMP3_NOT:%.*]] = icmp eq i32 [[TMP19]], 3
 ; CHECK-NEXT:    br i1 [[CMP3_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
 ; CHECK:       if.then:
 ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i32 [[TMP19]], 2
+; CHECK-NEXT:    [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i32 [[TMP20]], 2
 ; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    store i32 [[MUL]], ptr [[ARRAYIDX7]], align 4
 ; CHECK-NEXT:    br label [[FOR_INC]]
@@ -635,26 +638,26 @@ define void @test_rev_loops_strided_deref_loads(ptr nocapture noundef writeonly
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <2 x i32> [[REVERSE]], <i32 3, i32 3>
 ; CHECK-NEXT:    [[TMP5:%.*]] = xor <2 x i1> [[TMP4]], <i1 true, i1 true>
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul <2 x i64> [[VEC_IND]], <i64 2, i64 2>
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
-; CHECK-NEXT:    br i1 [[TMP7]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP8]], align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 4
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
+; CHECK-NEXT:    br i1 [[TMP13]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; CHECK:       pred.store.if:
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP12:%.*]] = shl nsw i32 [[TMP10]], 2
-; CHECK-NEXT:    store i32 [[TMP12]], ptr [[TMP11]], align 4
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP15:%.*]] = shl nsw i32 [[TMP11]], 2
+; CHECK-NEXT:    store i32 [[TMP15]], ptr [[TMP14]], align 4
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; CHECK:       pred.store.continue:
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
-; CHECK-NEXT:    br i1 [[TMP13]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
+; CHECK-NEXT:    br i1 [[TMP16]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
 ; CHECK:       pred.store.if1:
-; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], -1
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP14]]
-; CHECK-NEXT:    [[TMP19:%.*]] = shl nsw i32 [[TMP17]], 2
+; CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], -1
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = shl nsw i32 [[TMP12]], 2
 ; CHECK-NEXT:    store i32 [[TMP19]], ptr [[TMP18]], align 4
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE2]]
 ; CHECK:       pred.store.continue2:

>From 9fd4d59353215f7c68769bf197938ca4d9cec403 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Thu, 25 Jul 2024 12:05:58 +0000
Subject: [PATCH 3/3] [LoopVectorize] Add support for vectorisation of simple
 early exit loops

This patch adds support for vectorisation of a simple class of loops
that typically involves searching for something, e.g.

  for (int i = 0; i < n; i++) {
    if (p[i] == val)
      return i;
  }
  return n;

or

  for (int i = 0; i < n; i++) {
    if (p1[i] != p2[i])
      return i;
  }
  return n;

In this initial commit we only vectorise loops that meet the
following criteria:

1. There are no stores in the loop.
2. The loop must have only one early exit like those shown in the
examples above. I have referred to such exits as speculative early
exits, to distinguish them from existing support for early exits
where the exit-not-taken count is known exactly at compile time.
3. The early exit block dominates the latch block.
4. There are no loads after the early exit block.
5. The loop must not contain reductions or recurrences. I don't
see anything fundamental blocking vectorisation of such loops, but
I just haven't done the work to support them yet.
6. We must be able to prove at compile time that the loop will not
contain faulting loads.

For point 6, once this patch lands I intend to follow up by supporting
some limited cases of faulting loops where we can version the loop
based on pointer alignment. For example, it turns out there is a
std::find loop in the SPEC2017 benchmark suite that we can vectorise
provided we add SCEV checks for the initial pointer being aligned
to a multiple of the VF. In practice, the pointer is regularly
aligned to at least 32/64 bytes and, since the VF is a power of 2, any
vector load <= 32/64 bytes in size can only fault on the first
lane, following the same behaviour as the scalar loop. Given we
already do such speculative versioning for loops with unknown strides,
alignment-based versioning doesn't seem to be any worse.
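
To make the alignment argument concrete, here is a hedged C++ sketch
(p and VF are illustrative names; the real check would be generated
as a SCEV runtime check):

  // If p is aligned to VF * sizeof(*p), and that is no larger than the
  // minimum page size, then a VF-wide vector load from p cannot cross
  // a page boundary. Any fault must therefore occur on lane 0, which
  // is exactly the element the scalar loop would have faulted on.
  uintptr_t Addr = reinterpret_cast<uintptr_t>(p);
  bool FaultsMatchScalarLoop = (Addr % (VF * sizeof(*p))) == 0;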

This patch makes use of the existing experimental_cttz_elts intrinsic
in the vectorised early exit block to determine the first lane that
triggered the exit. This intrinsic has generic lowering support so
it's guaranteed to work for all targets.
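
As a rough sketch, the vectorised early exit block recovers the
exiting lane along these lines, using the IRBuilder helper added in
this patch (Mask, IdxTy and CanonicalIV are illustrative names):

  // Mask is the vector early exit condition; counting its trailing
  // zero elements yields the first lane that wanted to exit.
  Value *FirstLane = B.CreateCountTrailingZeroElems(IdxTy, Mask);
  // Offsetting by the canonical IV value for the current vector
  // iteration recovers the scalar induction value at the exit point.
  Value *EarlyExitIV = B.CreateAdd(FirstLane, CanonicalIV);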

Tests have been added here:

  Transforms/LoopVectorize/AArch64/simple_early_exit.ll
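
Since the feature is disabled by default, something like the following
invocation (sketch) should exercise the new code path:

  opt -passes=loop-vectorize -enable-early-exit-vectorization -S \
    llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll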
---
 .../llvm/Analysis/LoopAccessAnalysis.h        |    4 +-
 llvm/include/llvm/Analysis/ScalarEvolution.h  |   20 +
 llvm/include/llvm/IR/IRBuilder.h              |    7 +
 llvm/include/llvm/Support/GenericLoopInfo.h   |    4 +
 .../llvm/Support/GenericLoopInfoImpl.h        |   10 +
 .../Vectorize/LoopVectorizationLegality.h     |   56 +-
 llvm/lib/Analysis/LoopAccessAnalysis.cpp      |    7 +-
 llvm/lib/Analysis/ScalarEvolution.cpp         |   26 +
 .../Vectorize/LoopVectorizationLegality.cpp   |  183 +-
 .../Transforms/Vectorize/LoopVectorize.cpp    |  427 ++-
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |   49 +-
 llvm/lib/Transforms/Vectorize/VPlan.h         |   74 +-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |   57 +-
 .../Transforms/Vectorize/VPlanTransforms.cpp  |    2 +-
 .../AArch64/simple_early_exit.ll              | 2829 +++++++++++++++++
 .../X86/vectorization-remarks-missed.ll       |   21 +-
 .../Transforms/LoopVectorize/control-flow.ll  |    2 +-
 17 files changed, 3694 insertions(+), 84 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll

diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index afafb74bdcb0a..e134e01d630fd 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -665,7 +665,8 @@ class LoopAccessInfo {
   bool isInvariant(Value *V) const;
 
   unsigned getNumStores() const { return NumStores; }
-  unsigned getNumLoads() const { return NumLoads;}
+  unsigned getNumLoads() const { return NumLoads; }
+  unsigned getNumCalls() const { return NumCalls; }
 
   /// The diagnostics report generated for the analysis.  E.g. why we
   /// couldn't analyze the loop.
@@ -758,6 +759,7 @@ class LoopAccessInfo {
 
   unsigned NumLoads = 0;
   unsigned NumStores = 0;
+  unsigned NumCalls = 0;
 
   /// Cache the result of analyzeLoop.
   bool CanVecMem = false;
diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h
index d9bfca763819f..8074e3d7694cc 100644
--- a/llvm/include/llvm/Analysis/ScalarEvolution.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolution.h
@@ -919,6 +919,18 @@ class ScalarEvolution {
   const SCEV *getPredicatedSymbolicMaxBackedgeTakenCount(
       const Loop *L, SmallVector<const SCEVPredicate *, 4> &Predicates);
 
+  /// Return true if we have an exact backedge-taken count for the exiting
+  /// block \p BB of loop \p L.
+  bool hasExactPredicatedBackedgeTakenCount(const Loop *L, BasicBlock *BB) {
+    return getPredicatedBackedgeTakenInfo(L).hasExact(BB, this);
+  }
+
+  /// Return all the exiting blocks in loop \p L with exact exit counts.
+  void getCountableExitingBlocks(const Loop *L,
+                                 SmallVector<BasicBlock *, 4> *Blocks) {
+    getBackedgeTakenInfo(L).getCountableExitingBlocks(L, this, Blocks);
+  }
+
   /// Return true if the backedge taken count is either the value returned by
   /// getConstantMaxBackedgeTakenCount or zero.
   bool isBackedgeTakenCountMaxOrZero(const Loop *L);
@@ -1567,6 +1579,10 @@ class ScalarEvolution {
     const SCEV *getExact(const BasicBlock *ExitingBlock,
                          ScalarEvolution *SE) const;
 
+    /// Return all the exiting blocks in loop \p L with exact exit counts.
+    void getCountableExitingBlocks(const Loop *L, ScalarEvolution *SE,
+                                   SmallVector<BasicBlock *, 4> *Blocks) const;
+
     /// Get the constant max backedge taken count for the loop.
     const SCEV *getConstantMax(ScalarEvolution *SE) const;
 
@@ -1586,6 +1602,10 @@ class ScalarEvolution {
     /// Return true if the number of times this backedge is taken is either the
     /// value returned by getConstantMax or zero.
     bool isConstantMaxOrZero(ScalarEvolution *SE) const;
+
+    /// Return true if we have an exact exit-not-taken count for the exiting
+    /// block.
+    bool hasExact(const BasicBlock *ExitingBlock, ScalarEvolution *SE) const;
   };
 
   /// Cache the backedge-taken count of the loops for this function as they
diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
index 31a1fef321995..1209f625f2064 100644
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -2518,6 +2518,13 @@ class IRBuilderBase {
     return CreateShuffleVector(V, PoisonValue::get(V->getType()), Mask, Name);
   }
 
+  Value *CreateCountTrailingZeroElems(Type *ResTy, Value *Mask,
+                                      const Twine &Name = "") {
+    return CreateIntrinsic(
+        Intrinsic::experimental_cttz_elts, {ResTy, Mask->getType()},
+        {Mask, getInt1(/*ZeroIsPoison=*/true)}, nullptr, Name);
+  }
+
   Value *CreateExtractValue(Value *Agg, ArrayRef<unsigned> Idxs,
                             const Twine &Name = "") {
     if (auto *V = Folder.FoldExtractValue(Agg, Idxs))
diff --git a/llvm/include/llvm/Support/GenericLoopInfo.h b/llvm/include/llvm/Support/GenericLoopInfo.h
index d560ca648132c..4d474fd272bb2 100644
--- a/llvm/include/llvm/Support/GenericLoopInfo.h
+++ b/llvm/include/llvm/Support/GenericLoopInfo.h
@@ -294,6 +294,10 @@ template <class BlockT, class LoopT> class LoopBase {
   /// Otherwise return null.
   BlockT *getUniqueExitBlock() const;
 
+  /// Return the exit block reached from the loop latch, or nullptr if the
+  /// latch is not an exiting block. Assumes the loop has a single latch.
+  BlockT *getLatchExitBlock() const;
+
   /// Return true if this loop does not have any exit blocks.
   bool hasNoExitBlocks() const;
 
diff --git a/llvm/include/llvm/Support/GenericLoopInfoImpl.h b/llvm/include/llvm/Support/GenericLoopInfoImpl.h
index d19022729ace3..d91619dad4dcd 100644
--- a/llvm/include/llvm/Support/GenericLoopInfoImpl.h
+++ b/llvm/include/llvm/Support/GenericLoopInfoImpl.h
@@ -159,6 +159,16 @@ BlockT *LoopBase<BlockT, LoopT>::getUniqueExitBlock() const {
   return getExitBlockHelper(this, true).first;
 }
 
+template <class BlockT, class LoopT>
+BlockT *LoopBase<BlockT, LoopT>::getLatchExitBlock() const {
+  BlockT *Latch = getLoopLatch();
+  assert(Latch && "Latch block must exist");
+  for (BlockT *Successor : children<BlockT *>(Latch))
+    if (!contains(Successor))
+      return Successor;
+  return nullptr;
+}
+
 /// getExitEdges - Return all pairs of (_inside_block_,_outside_block_).
 template <class BlockT, class LoopT>
 void LoopBase<BlockT, LoopT>::getExitEdges(
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index 2ff17bd2f7a71..2eba8f47a1434 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -377,6 +377,26 @@ class LoopVectorizationLegality {
     return LAI->getDepChecker().getMaxSafeVectorWidthInBits();
   }
 
+  /// Returns true if the loop has a speculative early exit, i.e. an
+  /// uncountable exiting block that is not the latch.
+  bool hasSpeculativeEarlyExit() const { return HasSpeculativeEarlyExit; }
+
+  /// Returns the early exiting block in a loop with a speculative backedge
+  /// count.
+  BasicBlock *getSpeculativeEarlyExitingBlock() const {
+    assert(getUncountableExitingBlocks().size() == 1 &&
+           "Expected only a single uncountable exiting block");
+    return getUncountableExitingBlocks()[0];
+  }
+
+  /// Returns the destination of an early exiting block in a loop with a
+  /// speculative backedge count.
+  BasicBlock *getSpeculativeEarlyExitBlock() const {
+    assert(getUncountableExitBlocks().size() == 1 &&
+           "Expected only a single uncountable exit block");
+    return getUncountableExitBlocks()[0];
+  }
+
   /// Returns true if vector representation of the instruction \p I
   /// requires mask.
   bool isMaskRequired(const Instruction *I) const {
@@ -404,6 +424,22 @@ class LoopVectorizationLegality {
 
   DominatorTree *getDominatorTree() const { return DT; }
 
+  /// Returns all exiting blocks with a countable exit, i.e. the
+  /// exit-not-taken count is known exactly at compile time.
+  const SmallVector<BasicBlock *, 4> &getCountableExitingBlocks() const {
+    return CountableExitingBlocks;
+  }
+
+  /// Returns all the exiting blocks with an uncountable exit.
+  const SmallVector<BasicBlock *, 4> &getUncountableExitingBlocks() const {
+    return UncountableExitingBlocks;
+  }
+
+  /// Returns all the exit blocks from uncountable exiting blocks.
+  SmallVector<BasicBlock *, 4> getUncountableExitBlocks() const {
+    return UncountableExitBlocks;
+  }
+
 private:
   /// Return true if the pre-header, exiting and latch blocks of \p Lp and all
   /// its nested loops are considered legal for vectorization. These legal
@@ -436,7 +472,7 @@ class LoopVectorizationLegality {
   /// we read and write from memory. This method checks if it is
   /// legal to vectorize the code, considering only memory constrains.
   /// Returns true if the loop is vectorizable
-  bool canVectorizeMemory();
+  bool canVectorizeMemory(bool IsEarlyExitLoop);
 
   /// Return true if we can vectorize this loop using the IF-conversion
   /// transformation.
@@ -446,6 +482,14 @@ class LoopVectorizationLegality {
   /// specific checks for outer loop vectorization.
   bool canVectorizeOuterLoop();
 
+  /// Returns true if this is a supported early exit loop that we can
+  /// vectorize.
+  bool isVectorizableEarlyExitLoop();
+
+  /// Record information about the different exiting blocks, both countable and
+  /// uncountable.
+  void recordExitingBlocks();
+
   /// Return true if all of the instructions in the block can be speculatively
   /// executed, and record the loads/stores that require masking.
   /// \p SafePtrs is a list of addresses that are known to be legal and we know
@@ -551,6 +595,16 @@ class LoopVectorizationLegality {
   /// (potentially) make a better decision on the maximum VF and enable
   /// the use of those function variants.
   bool VecCallVariantsFound = false;
+
+  /// Indicates whether this loop has a speculative early exit, i.e. an
+  /// uncountable exiting block that is not the latch.
+  bool HasSpeculativeEarlyExit = false;
+
+  /// Keeps track of all the exits with known or countable exit-not-taken
+  /// counts.
+  SmallVector<BasicBlock *, 4> CountableExitingBlocks;
+  SmallVector<BasicBlock *, 4> UncountableExitingBlocks;
+  SmallVector<BasicBlock *, 4> UncountableExitBlocks;
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 84214c47a10e1..d7bdceaeb7c46 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -2453,8 +2453,11 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
       // vectorize a loop if it contains known function calls that don't set
       // the flag. Therefore, it is safe to ignore this read from memory.
       auto *Call = dyn_cast<CallInst>(&I);
-      if (Call && getVectorIntrinsicIDForCall(Call, TLI))
-        continue;
+      if (Call) {
+        NumCalls++;
+        if (getVectorIntrinsicIDForCall(Call, TLI))
+          continue;
+      }
 
       // If this is a load, save it. If this instruction can read from memory
       // but is not a load, then we quit. Notice that we don't handle function
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 51cffac808768..eb6348d7eed4d 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -8609,6 +8609,23 @@ ScalarEvolution::BackedgeTakenInfo::getExact(const Loop *L, ScalarEvolution *SE,
   return SE->getUMinFromMismatchedTypes(Ops, /* Sequential */ true);
 }
 
+void ScalarEvolution::BackedgeTakenInfo::getCountableExitingBlocks(
+    const Loop *L, ScalarEvolution *SE,
+    SmallVector<BasicBlock *, 4> *Blocks) const {
+  const BasicBlock *Latch = L->getLoopLatch();
+  if (!Latch || !hasAnyInfo())
+    return;
+
+  for (const auto &ENT : ExitNotTaken) {
+    const SCEV *BECount = ENT.ExactNotTaken;
+    if (BECount == SE->getCouldNotCompute())
+      continue;
+    Blocks->push_back(ENT.ExitingBlock);
+  }
+
+  return;
+}
+
 /// Get the exact not taken count for this loop exit.
 const SCEV *
 ScalarEvolution::BackedgeTakenInfo::getExact(const BasicBlock *ExitingBlock,
@@ -8620,6 +8637,15 @@ ScalarEvolution::BackedgeTakenInfo::getExact(const BasicBlock *ExitingBlock,
   return SE->getCouldNotCompute();
 }
 
+bool ScalarEvolution::BackedgeTakenInfo::hasExact(
+    const BasicBlock *ExitingBlock, ScalarEvolution *SE) const {
+  for (const auto &ENT : ExitNotTaken)
+    if (ENT.ExitingBlock == ExitingBlock)
+      return ENT.ExactNotTaken != SE->getCouldNotCompute();
+
+  return false;
+}
+
 const SCEV *ScalarEvolution::BackedgeTakenInfo::getConstantMax(
     const BasicBlock *ExitingBlock, ScalarEvolution *SE) const {
   for (const auto &ENT : ExitNotTaken)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index cafec165f6d6f..2ccabbd8de029 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -78,6 +78,12 @@ static cl::opt<LoopVectorizeHints::ScalableForceKind>
                 "Scalable vectorization is available and favored when the "
                 "cost is inconclusive.")));
 
+static cl::opt<bool> AssumeNoMemFault(
+    "vectorizer-no-mem-fault", cl::init(false), cl::Hidden,
+    cl::desc("Assume vectorized loops will not have memory faults, which is "
+             "potentially unsafe but can be useful for testing vectorization "
+             "of early exit loops."));
+
 /// Maximum vectorization interleave count.
 static const unsigned MaxInterleaveFactor = 16;
 
@@ -1054,7 +1060,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
   return true;
 }
 
-bool LoopVectorizationLegality::canVectorizeMemory() {
+bool LoopVectorizationLegality::canVectorizeMemory(bool IsEarlyExitLoop) {
   LAI = &LAIs.getInfo(*TheLoop);
   const OptimizationRemarkAnalysis *LAR = LAI->getReport();
   if (LAR) {
@@ -1076,6 +1082,57 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
     return false;
   }
 
+  // For loops with uncountable early exiting blocks that are not the latch
+  // it's necessary to perform extra checks, since the vectoriser is currently
+  // only capable of handling simple search loops.
+  if (IsEarlyExitLoop) {
+    // We don't support calls or any memory accesses that write to memory.
+    if (LAI->getNumStores()) {
+      reportVectorizationFailure(
+          "Writes to memory unsupported in early exit loops",
+          "Cannot vectorize early exit loop", "WritesInEarlyExitLoop", ORE,
+          TheLoop);
+      LLVM_DEBUG(
+          dbgs()
+          << "LV: Found at least one write to memory in early exit loop.\n");
+      return false;
+    }
+
+    if (LAI->getNumCalls()) {
+      reportVectorizationFailure("Calls unsupported in early exit loops",
+                                 "Cannot vectorize early exit loop",
+                                 "CallsInEarlyExitLoop", ORE, TheLoop);
+      LLVM_DEBUG(dbgs() << "LV: Found at least one call in early exit loop.\n");
+      return false;
+    }
+
+    // The vectoriser cannot handle loads that occur after the early exit block.
+    BasicBlock *LatchBB = TheLoop->getLoopLatch();
+    for (Instruction &I : *LatchBB) {
+      if (I.mayReadFromMemory()) {
+        reportVectorizationFailure("Loads not permitted after early exit",
+                                   "Cannot vectorize early exit loop",
+                                   "LoadsAfterEarlyExit", ORE, TheLoop);
+        LLVM_DEBUG(dbgs() << "LV: Found at least one load after early exit.\n");
+        return false;
+      }
+    }
+
+    // The vectoriser does not yet handle loops that may fault, but this will
+    // be improved in a follow-on patch.
+    if (!isDereferenceableReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC)) {
+      // This flag is intended for testing purposes only - enabling it is
+      // potentially unsafe.
+      if (!AssumeNoMemFault) {
+        LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot vectorize faulting "
+                          << "loop with early exit.\n");
+        return false;
+      } else
+        LLVM_DEBUG(dbgs() << "LV: Assuming early exit vector loop will not "
+                          << "fault\n");
+    }
+  }
+
   // We can vectorize stores to invariant address when final reduction value is
   // guaranteed to be stored at the end of the loop. Also, if decision to
   // vectorize loop is made, runtime checks are added so as to make sure that
@@ -1439,6 +1496,103 @@ bool LoopVectorizationLegality::canVectorizeLoopNestCFG(
   return Result;
 }
 
+void LoopVectorizationLegality::recordExitingBlocks() {
+  SmallVector<BasicBlock *, 8> ExitingBlocks;
+  TheLoop->getExitingBlocks(ExitingBlocks);
+
+  SmallVector<BasicBlock *, 4> CountableExitingBBs;
+  PSE.getSE()->getCountableExitingBlocks(TheLoop, &CountableExitingBBs);
+
+  // There could be multiple exiting blocks with an exact exit-not-taken
+  // count. Find all of the uncountable exiting blocks, i.e. the ones with
+  // an unknown count, together with their corresponding exit blocks.
+  SmallVector<BasicBlock *, 4> UncountableExitingBBs;
+  SmallVector<BasicBlock *, 4> UncountableExitBBs;
+  for (BasicBlock *BB1 : ExitingBlocks) {
+    if (!is_contained(CountableExitingBBs, BB1)) {
+      UncountableExitingBBs.push_back(BB1);
+
+      for (BasicBlock *BB2 : successors(BB1)) {
+        if (!TheLoop->contains(BB2)) {
+          UncountableExitBBs.push_back(BB2);
+          break;
+        }
+      }
+    }
+  }
+
+  CountableExitingBlocks = std::move(CountableExitingBBs);
+  UncountableExitingBlocks = std::move(UncountableExitingBBs);
+  UncountableExitBlocks = std::move(UncountableExitBBs);
+}
+
+bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
+  // At least one of the exiting blocks must be the latch.
+  BasicBlock *LatchBB = TheLoop->getLoopLatch();
+  if (!LatchBB) {
+    reportVectorizationFailure("Loop does not have a latch",
+                               "Cannot vectorize early exit loop",
+                               "NoLatchEarlyExit", ORE, TheLoop);
+    LLVM_DEBUG(dbgs() << "LV: Loop does not have a latch.\n");
+    return false;
+  }
+
+  recordExitingBlocks();
+
+  // We only support one uncountable early exit.
+  if (getUncountableExitingBlocks().size() != 1) {
+    reportVectorizationFailure("Loop has too many uncountable exits",
+                               "Cannot vectorize early exit loop",
+                               "TooManyUncountableEarlyExits", ORE, TheLoop);
+    LLVM_DEBUG(
+        dbgs() << "LV: Loop does not have one uncountable exiting block.\n");
+    return false;
+  }
+
+  // The only supported early exit loops so far are ones where the early
+  // exiting block is the unique predecessor of the latch block.
+  BasicBlock *LatchPredBB = LatchBB->getUniquePredecessor();
+  if (!LatchPredBB || LatchPredBB != getUncountableExitingBlocks()[0]) {
+    reportVectorizationFailure("Early exit is not the latch predecessor",
+                               "Cannot vectorize early exit loop",
+                               "EarlyExitNotLatchPredecessor", ORE, TheLoop);
+    LLVM_DEBUG(
+        dbgs() << "LV: Early exit block is not unique predecessor of latch\n");
+    return false;
+  }
+
+  if (Reductions.size() || FixedOrderRecurrences.size()) {
+    reportVectorizationFailure(
+        "Found reductions or recurrences in early-exit loop",
+        "Cannot vectorize early exit loop", "RecurrencesInEarlyExitLoop", ORE,
+        TheLoop);
+    LLVM_DEBUG(
+        dbgs() << "LV: Found reductions/recurrences in early exit loop.\n");
+    return false;
+  }
+
+  LLVM_DEBUG(
+      dbgs()
+      << "LV: Found an early exit. Retrying with speculative exit count.\n");
+  if (!PSE.getSE()->hasExactPredicatedBackedgeTakenCount(TheLoop, LatchBB)) {
+    reportVectorizationFailure(
+        "Cannot determine exact exit count for latch block",
+        "Cannot vectorize early exit loop",
+        "UnknownLatchExitCountEarlyExitLoop", ORE, TheLoop);
+    LLVM_DEBUG(
+        dbgs() << "LV: Uncountable exit for latch in early exit loop.\n");
+    return false;
+  }
+
+  const SCEV *SpecExitCount = PSE.getSymbolicMaxBackedgeTakenCount();
+  assert(!isa<SCEVCouldNotCompute>(SpecExitCount) &&
+         "Failed to get symbolic expression for backedge taken count");
+
+  LLVM_DEBUG(dbgs() << "LV: Found speculative backedge taken count: "
+                    << *SpecExitCount << '\n');
+  return true;
+}
+
 bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
   // Store the result and return it at the end instead of exiting early, in case
   // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
@@ -1497,19 +1651,24 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
       return false;
   }
 
-  // Go over each instruction and look at memory deps.
-  if (!canVectorizeMemory()) {
-    LLVM_DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
-    if (DoExtraAnalysis)
-      Result = false;
-    else
-      return false;
+  HasSpeculativeEarlyExit = false;
+  if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
+    if (!isVectorizableEarlyExitLoop()) {
+      reportVectorizationFailure(
+          "could not determine number of loop iterations",
+          "could not determine number of loop iterations",
+          "CantComputeNumberOfIterations", ORE, TheLoop);
+      if (DoExtraAnalysis)
+        Result = false;
+      else
+        return false;
+    } else
+      HasSpeculativeEarlyExit = true;
   }
 
-  if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
-    reportVectorizationFailure("could not determine number of loop iterations",
-                               "could not determine number of loop iterations",
-                               "CantComputeNumberOfIterations", ORE, TheLoop);
+  // Go over each instruction and look at memory deps.
+  if (!canVectorizeMemory(HasSpeculativeEarlyExit)) {
+    LLVM_DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
     if (DoExtraAnalysis)
       Result = false;
     else
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6d28b8fabe42e..47f0fea60b1a8 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -182,6 +182,10 @@ static cl::opt<bool> EnableEpilogueVectorization(
     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
     cl::desc("Enable vectorization of epilogue loops."));
 
+static cl::opt<bool> EnableEarlyExitVectorization(
+    "enable-early-exit-vectorization", cl::init(false), cl::Hidden,
+    cl::desc("Enable vectorization of early exit loops."));
+
 static cl::opt<unsigned> EpilogueVectorizationForceVF(
     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
     cl::desc("When epilogue vectorization is enabled, and a value greater than "
@@ -560,8 +564,12 @@ class InnerLoopVectorizer {
   /// Set up the values of the IVs correctly when exiting the vector loop.
   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                     Value *VectorTripCount, Value *EndValue,
-                    BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
-                    VPlan &Plan, VPTransformState &State);
+                    BasicBlock *MiddleBlock, VPlan &Plan,
+                    VPTransformState &State);
+
+  void fixupEarlyExitIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
+                             BasicBlock *VectorEarlyExitBB, VPlan &Plan,
+                             VPTransformState &State);
 
   /// Iteratively sink the scalarized operands of a predicated instruction into
   /// the block that was created for it.
@@ -927,7 +935,7 @@ Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
 
 const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE,
                                 Loop *OrigLoop) {
-  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
+  const SCEV *BackedgeTakenCount = PSE.getSymbolicMaxBackedgeTakenCount();
   assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");
 
   ScalarEvolution &SE = *PSE.getSE();
@@ -1403,11 +1411,28 @@ class LoopVectorizationCostModel {
     }
     // If we might exit from anywhere but the latch, must run the exiting
     // iteration in scalar form.
-    if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
+    if (!Legal->hasSpeculativeEarlyExit() &&
+        TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
       LLVM_DEBUG(
           dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n");
       return true;
     }
+    // If this is a loop with a speculative early exit, then we may validly
+    // exit from a non-latch block and not require a scalar epilogue for the
+    // last iteration, since these exits are handled specially. However, since
+    // we could have both countable and speculative exits we must search all
+    // the exits.
+    if (Legal->hasSpeculativeEarlyExit()) {
+      const SmallVector<BasicBlock *, 4> &CountableExitingBlocks =
+          Legal->getCountableExitingBlocks();
+      if (CountableExitingBlocks.size() > 1 ||
+          (CountableExitingBlocks.size() == 1 &&
+           CountableExitingBlocks[0] != TheLoop->getLoopLatch())) {
+        LLVM_DEBUG(
+            dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n");
+        return true;
+      }
+    }
     if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
       LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
                            "interleaved group requires scalar epilogue\n");
@@ -1836,8 +1861,9 @@ class GeneratedRTChecks {
   GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
                     TargetTransformInfo *TTI, const DataLayout &DL,
                     bool AddBranchWeights)
-      : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
-        MemCheckExp(SE, DL, "scev.check"), AddBranchWeights(AddBranchWeights) {}
+      : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check", true),
+        MemCheckExp(SE, DL, "scev.check", true),
+        AddBranchWeights(AddBranchWeights) {}
 
   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
   /// accurately estimate the cost of the runtime checks. The blocks are
@@ -2591,7 +2617,12 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
   LoopScalarBody = OrigLoop->getHeader();
   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
   assert(LoopVectorPreHeader && "Invalid loop structure");
-  LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
+
+  if (Cost->Legal->hasSpeculativeEarlyExit())
+    LoopExitBlock = OrigLoop->getLatchExitBlock();
+  else
+    LoopExitBlock = OrigLoop->getUniqueExitBlock();
+
   assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
          "multiple exit loop without required epilogue?");
 
@@ -2767,26 +2798,33 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton(
 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
                                        const InductionDescriptor &II,
                                        Value *VectorTripCount, Value *EndValue,
-                                       BasicBlock *MiddleBlock,
-                                       BasicBlock *VectorHeader, VPlan &Plan,
+                                       BasicBlock *MiddleBlock, VPlan &Plan,
                                        VPTransformState &State) {
   // There are two kinds of external IV usages - those that use the value
   // computed in the last iteration (the PHI) and those that use the penultimate
   // value (the value that feeds into the phi from the loop latch).
   // We allow both, but they, obviously, have different values.
 
-  assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
+  assert((OrigLoop->getUniqueExitBlock() || Legal->hasSpeculativeEarlyExit()) &&
+         "Expected a single exit block");
 
   DenseMap<Value *, Value *> MissingVals;
 
   // An external user of the last iteration's value should see the value that
   // the remainder loop uses to initialize its own IV.
-  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
+  BasicBlock *OrigLoopLatch = OrigLoop->getLoopLatch();
+  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoopLatch);
   for (User *U : PostInc->users()) {
     Instruction *UI = cast<Instruction>(U);
     if (!OrigLoop->contains(UI)) {
-      assert(isa<PHINode>(UI) && "Expected LCSSA form");
-      MissingVals[UI] = EndValue;
+      PHINode *PHI = dyn_cast<PHINode>(UI);
+      assert(PHI && "Expected LCSSA form");
+      // Even though the user is outside the loop, the incoming value may not
+      // come from the latch block. This could be an early exit loop with
+      // multiple paths to the same successor.
+      int Index = PHI->getBasicBlockIndex(OrigLoopLatch);
+      if (Index != -1 && PHI->getIncomingValue(Index) == PostInc)
+        MissingVals[PHI] = EndValue;
     }
   }
 
@@ -2796,7 +2834,12 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
   for (User *U : OrigPhi->users()) {
     auto *UI = cast<Instruction>(U);
     if (!OrigLoop->contains(UI)) {
-      assert(isa<PHINode>(UI) && "Expected LCSSA form");
+      PHINode *PHI = dyn_cast<PHINode>(UI);
+      assert(PHI && "Expected LCSSA form");
+      int Index = PHI->getBasicBlockIndex(OrigLoopLatch);
+      if (Index == -1 || PHI->getIncomingValue(Index) != OrigPhi)
+        continue;
+
       IRBuilder<> B(MiddleBlock->getTerminator());
 
       // Fast-math-flags propagate from the original induction instruction.
@@ -2833,6 +2876,94 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
   }
 }
 
+void InnerLoopVectorizer::fixupEarlyExitIVUsers(PHINode *OrigPhi,
+                                                const InductionDescriptor &II,
+                                                BasicBlock *VectorEarlyExitBB,
+                                                VPlan &Plan,
+                                                VPTransformState &State) {
+  // There are two kinds of external IV usages - those that use the value
+  // computed in the last iteration (the PHI) and those that use the penultimate
+  // value (the value that feeds into the phi from the loop latch).
+  // We allow both, but they, obviously, have different values.
+  DenseMap<Value *, Value *> MissingVals;
+  BasicBlock *OrigEarlyExitingBlock = Legal->getSpeculativeEarlyExitingBlock();
+  BasicBlock *OrigLoopLatch = OrigLoop->getLoopLatch();
+  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoopLatch);
+
+  auto FixUpPhi = [&](Instruction *UI, bool PostInc) -> Value * {
+    IRBuilder<> B(VectorEarlyExitBB->getTerminator());
+    assert(isa<PHINode>(UI) && "Expected LCSSA form");
+
+    // Fast-math-flags propagate from the original induction instruction.
+    if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
+      B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
+
+    // We need to discover the mask that led us into the early exit block.
+    Value *EarlyExitMask = Plan.getVectorLoopRegion()->getEarlyExitMask(&State);
+    VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
+    Type *CtzType = CanonicalIV->getStartValue()->getLiveInIRValue()->getType();
+    Value *Ctz;
+    if (EarlyExitMask)
+      Ctz = B.CreateCountTrailingZeroElems(CtzType, EarlyExitMask);
+    else
+      Ctz = ConstantInt::get(CtzType, 0);
+    Ctz = B.CreateAdd(Ctz,
+                      cast<PHINode>(State.get(CanonicalIV->getVPSingleValue(),
+                                              0, /*IsScalar=*/true)));
+    if (PostInc)
+      Ctz = B.CreateAdd(Ctz, ConstantInt::get(CtzType, 1));
+
+    Value *Escape = nullptr;
+    VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep());
+    assert(StepVPV && "step must have been expanded during VPlan execution");
+    Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
+                                      : State.get(StepVPV, {0, 0});
+    Escape = emitTransformedIndex(B, Ctz, II.getStartValue(), Step,
+                                  II.getKind(), II.getInductionBinOp());
+    Escape->setName("ind.early.escape");
+
+    return Escape;
+  };
+
+  const Loop *L = this->OrigLoop;
+  auto isUsedInEarlyExitBlock =
+      [&L, &OrigEarlyExitingBlock](Value *V, Instruction *UI) -> bool {
+    if (!L->contains(UI)) {
+      PHINode *PHI = dyn_cast<PHINode>(UI);
+      assert(PHI && "Expected LCSSA form");
+      int Index = PHI->getBasicBlockIndex(OrigEarlyExitingBlock);
+      if (Index != -1 && PHI->getIncomingValue(Index) == V)
+        return true;
+    }
+    return false;
+  };
+
+  for (User *U : PostInc->users()) {
+    auto *UI = cast<Instruction>(U);
+    if (isUsedInEarlyExitBlock(PostInc, UI))
+      MissingVals[UI] = FixUpPhi(UI, true);
+  }
+
+  for (User *U : OrigPhi->users()) {
+    auto *UI = cast<Instruction>(U);
+    if (isUsedInEarlyExitBlock(OrigPhi, UI))
+      MissingVals[UI] = FixUpPhi(UI, false);
+  }
+
+  for (auto &I : MissingVals) {
+    PHINode *PHI = cast<PHINode>(I.first);
+    // One corner case we have to handle is two IVs "chasing" each other,
+    // that is %IV2 = phi [...], [ %IV1, %latch ]
+    // In this case, if IV1 has an external use, we need to avoid adding both
+    // "last value of IV1" and "penultimate value of IV2". So, verify that we
+    // don't already have an incoming value for the vector early exit block.
+    if (PHI->getBasicBlockIndex(VectorEarlyExitBB) == -1) {
+      PHI->addIncoming(I.second, VectorEarlyExitBB);
+      Plan.removeEarlyExitLiveOut(PHI);
+    }
+  }
+}
+
 namespace {
 
 struct CSEDenseMapInfo {
@@ -2965,6 +3096,22 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
   VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
   VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock();
   Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
+
+  BasicBlock *VectorEarlyExitBB = nullptr;
+  if (VectorRegion->getEarlyExit()) {
+    // Fix-up external users of the induction variables.
+    VPBasicBlock *VectorEarlyExitVPBB =
+        cast<VPBasicBlock>(VectorRegion->getEarlyExit());
+    VectorEarlyExitBB = State.CFG.VPBB2IRBB[VectorEarlyExitVPBB];
+    for (const auto &Entry : Legal->getInductionVars())
+      fixupEarlyExitIVUsers(Entry.first, Entry.second, VectorEarlyExitBB, Plan,
+                            State);
+
+    BasicBlock *OrigEarlyExitBB = Legal->getSpeculativeEarlyExitBlock();
+    if (Loop *EEL = LI->getLoopFor(OrigEarlyExitBB))
+      EEL->addBasicBlockToLoop(VectorEarlyExitBB, *LI);
+  }
+
   if (Cost->requiresScalarEpilogue(VF.isVector())) {
     // No edge from the middle block to the unique exit block has been inserted
     // and there is nothing to fix from vector loop; phis should have incoming
@@ -2981,14 +3128,19 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
     for (const auto &Entry : Legal->getInductionVars())
       fixupIVUsers(Entry.first, Entry.second,
                    getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
-                   IVEndValues[Entry.first], LoopMiddleBlock,
-                   VectorLoop->getHeader(), Plan, State);
+                   IVEndValues[Entry.first], LoopMiddleBlock, Plan, State);
   }
 
   // Fix live-out phis not already fixed earlier.
   for (const auto &KV : Plan.getLiveOuts())
     KV.second->fixPhi(Plan, State);
 
+  if (VectorEarlyExitBB) {
+    State.Builder.SetInsertPoint(VectorEarlyExitBB->getTerminator());
+    for (const auto &KV : Plan.getEarlyExitLiveOuts())
+      KV.second->fixPhi(Plan, State);
+  }
+
   for (Instruction *PI : PredicatedInstructions)
     sinkScalarOperands(&*PI);
 
@@ -3599,9 +3751,12 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
   SmallVector<BasicBlock *> Exiting;
   TheLoop->getExitingBlocks(Exiting);
   for (BasicBlock *E : Exiting) {
-    auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
-    if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
-      addToWorklistIfAllowed(Cmp);
+    if (!Legal->hasSpeculativeEarlyExit() ||
+        E != Legal->getSpeculativeEarlyExitingBlock()) {
+      auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
+      if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
+        addToWorklistIfAllowed(Cmp);
+    }
   }
 
   auto PrevVF = VF.divideCoefficientBy(2);
@@ -4050,7 +4205,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
   // a bottom-test and a single exiting block. We'd have to handle the fact
   // that not every instruction executes on the last iteration.  This will
   // require a lane mask which varies through the vector loop body.  (TODO)
-  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
+  if (Legal->hasSpeculativeEarlyExit() ||
+      TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
     // If there was a tail-folding hint/switch, but we can't fold the tail by
     // masking, fallback to a vectorization with a scalar epilogue.
     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
@@ -4619,7 +4775,9 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
   // Epilogue vectorization code has not been auditted to ensure it handles
   // non-latch exits properly.  It may be fine, but it needs auditted and
   // tested.
-  if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
+  // TODO: Add support for loops with an early exit.
+  if (Legal->hasSpeculativeEarlyExit() ||
+      OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
     return false;
 
   return true;
@@ -4853,6 +5011,10 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
   if (!Legal->isSafeForAnyVectorWidth())
     return 1;
 
+  // We don't attempt to perform interleaving for early exit loops.
+  if (Legal->hasSpeculativeEarlyExit())
+    return 1;
+
   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
   const bool HasReductions = !Legal->getReductionVars().empty();
 
@@ -6417,7 +6579,21 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
       // The back-edge branch will remain, as will all scalar branches.
       return TTI.getCFInstrCost(Instruction::Br, CostKind);
-    else
+    else if (Legal->hasSpeculativeEarlyExit() &&
+             I->getParent() == Legal->getSpeculativeEarlyExitingBlock()) {
+      // In order to determine whether we take an early exit or not we have to
+      // perform an or reduction of the vector predicate.
+      auto *Vec_i1Ty =
+          VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
+      InstructionCost EECost = TTI.getArithmeticReductionCost(
+          Instruction::Or, Vec_i1Ty, std::nullopt, CostKind);
+      // Add on the cost of the conditional branch, which will remain.
+      EECost += TTI.getCFInstrCost(Instruction::Br, CostKind);
+      // TODO: The vector loop early exit block also needs to do work to
+      // determine the first lane that triggered the exit. We should probably
+      // add that somehow, but the cost will be negligible for long loops.
+      return EECost;
+    } else
       // This branch will be eliminated by if-conversion.
       return 0;
     // Note: We currently assume zero cost for an unconditional branch inside
@@ -8371,13 +8547,19 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
 
 // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
 // original exit block.
-static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop,
-                                VPRecipeBuilder &Builder, VPlan &Plan) {
-  BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
-  BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
-  // Only handle single-exit loops with unique exit blocks for now.
-  if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB)
-    return;
+static void addUsersInExitBlock(Loop *OrigLoop, VPRecipeBuilder &Builder,
+                                VPlan &Plan) {
+  BasicBlock *ExitBB, *ExitingBB;
+
+  if (Plan.getVectorLoopRegion()->getEarlyExit()) {
+    ExitingBB = OrigLoop->getLoopLatch();
+    ExitBB = OrigLoop->getLatchExitBlock();
+  } else {
+    ExitBB = OrigLoop->getUniqueExitBlock();
+    ExitingBB = OrigLoop->getExitingBlock();
+    if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB)
+      return;
+  }
 
   // Introduce VPUsers modeling the exit values.
   for (PHINode &ExitPhi : ExitBB->phis()) {
@@ -8449,6 +8631,52 @@ static void addLiveOutsForFirstOrderRecurrences(VPlan &Plan) {
   }
 }
 
+// Add early exit values to \p Plan. VPLiveOuts are added for each LCSSA phi
+// in the original early exit block.
+static void addUsersInEarlyExitBlock(Loop *OrigLoop, BasicBlock *EarlyExitingBB,
+                                     BasicBlock *EarlyExitBB,
+                                     VPRecipeBuilder &Builder, VPlan &Plan) {
+  // Introduce VPUsers modeling the exit values.
+  for (PHINode &ExitPhi : EarlyExitBB->phis()) {
+    Value *IncomingValue = ExitPhi.getIncomingValueForBlock(EarlyExitingBB);
+    VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue, Plan);
+    Plan.addEarlyExitLiveOut(&ExitPhi, V);
+  }
+}
+
+static VPValue *getConditionForVectorEarlyExit(Loop *OrigLoop,
+                                               BasicBlock *ExitingBB,
+                                               VPlan &Plan, VPBuilder &Builder,
+                                               VPRecipeBuilder &RecipeBuilder,
+                                               VPRecipeBase *VPEarlyExitCond) {
+  // To make things easier we canonicalise the condition so that 'true'
+  // means take the early exit.
+  auto *BI = cast<BranchInst>(ExitingBB->getTerminator());
+
+  // If the true destination is in the loop then we want to invert the
+  // condition so that true means early exit.
+  bool NeedsInvert = OrigLoop->contains(BI->getSuccessor(0));
+
+  VPValue *ScalarExitCond;
+  if (!VPEarlyExitCond) {
+    // If we didn't find the exit condition, then this must have been
+    // defined outside the loop and is loop invariant.
+    ScalarExitCond =
+        RecipeBuilder.getVPValueOrAddLiveIn(BI->getCondition(), Plan);
+    if (NeedsInvert)
+      ScalarExitCond = Builder.createNot(ScalarExitCond);
+  } else {
+    VPValue *EarlyExitMask = VPEarlyExitCond->getVPSingleValue();
+    if (NeedsInvert)
+      EarlyExitMask = Builder.createNot(EarlyExitMask);
+    Plan.getVectorLoopRegion()->setVectorEarlyExitCond(EarlyExitMask);
+    // If any lane of EarlyExitMask would be true we should exit the loop.
+    ScalarExitCond =
+        Builder.createNaryOp(VPInstruction::OrReduction, {EarlyExitMask});
+  }
+  return ScalarExitCond;
+}
+
 VPlanPtr
 LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
 
@@ -8471,10 +8699,16 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
             return !CM.requiresScalarEpilogue(VF.isVector());
           },
           Range);
+
+  BasicBlock *EarlyExitingBB = nullptr, *EarlyExitBB = nullptr;
+  if (Legal->hasSpeculativeEarlyExit()) {
+    EarlyExitingBB = Legal->getSpeculativeEarlyExitingBlock();
+    EarlyExitBB = Legal->getSpeculativeEarlyExitBlock();
+  }
   VPlanPtr Plan = VPlan::createInitialVPlan(
       createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
       *PSE.getSE(), RequiresScalarEpilogueCheck, CM.foldTailByMasking(),
-      OrigLoop);
+      OrigLoop, EarlyExitingBB, EarlyExitBB);
 
   // Don't use getDecisionAndClampRange here, because we don't know the UF
   // so this function is better to be conservative, rather than to split
@@ -8537,6 +8771,18 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
         bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
         return Legal->blockNeedsPredication(BB) || NeedsBlends;
       });
+
+  // If we find the recipe for the early exit condition we need to record it
+  // so that we can then generate the new vector exit condition.
+  VPRecipeBase *VPEarlyExitCond = nullptr;
+  Value *EarlyExitCond = nullptr;
+  VPBlockBase *EarlyExitVPBB = nullptr;
+  if (EarlyExitingBB) {
+    EarlyExitVPBB = Plan->getVectorLoopRegion()->getEarlyExit();
+    BranchInst *BI = cast<BranchInst>(EarlyExitingBB->getTerminator());
+    EarlyExitCond = BI->getCondition();
+  }
+
   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
     // Relevant instructions from basic block BB will be grouped into VPRecipe
     // ingredients and fill a new VPBasicBlock.
@@ -8575,6 +8821,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
       if (!Recipe)
         Recipe = RecipeBuilder.handleReplication(Instr, Range);
 
+      if (&I == EarlyExitCond)
+        VPEarlyExitCond = Recipe;
+
       RecipeBuilder.setRecipe(Instr, Recipe);
       if (isa<VPHeaderPHIRecipe>(Recipe)) {
         // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
@@ -8593,19 +8842,47 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
         VPBB->appendRecipe(Recipe);
     }
 
-    VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
-    VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
+    // If this is the early exiting block we need to do more work to generate
+    // the actual exit condition. We generate an or reduction of the vector
+    // condition so that we exit the loop if any lane of the vector would
+    // cause us to exit.
+    if (BB == EarlyExitingBB) {
+      VPValue *ScalarExitCond = getConditionForVectorEarlyExit(
+          OrigLoop, BB, *Plan, Builder, RecipeBuilder, VPEarlyExitCond);
+
+      // Branch to early exit BB.
+      auto *NewBR =
+          new VPInstruction(VPInstruction::BranchOnCond, {ScalarExitCond});
+      RecipeBuilder.setRecipe(cast<BranchInst>(BB->getTerminator()), NewBR);
+      VPBB->appendRecipe(NewBR);
+
+      VPBasicBlock *InLoopVPBB = new VPBasicBlock();
+
+      // VPBB should still have exactly one successor at this point.
+      VPBlockBase *Successor = VPBB->getSingleSuccessor();
+      VPBlockUtils::disconnectBlocks(VPBB, Successor);
+      VPBlockUtils::insertTwoBlocksAfter(EarlyExitVPBB, InLoopVPBB, VPBB);
+      VPBlockUtils::connectBlocks(InLoopVPBB, Successor);
+
+      VPBB = InLoopVPBB;
+    } else {
+      VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
+      VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
+    }
   }
 
   // After here, VPBB should not be used.
   VPBB = nullptr;
-
+  if (EarlyExitingBB)
+    addUsersInEarlyExitBlock(OrigLoop, EarlyExitingBB,
+                             Legal->getSpeculativeEarlyExitBlock(),
+                             RecipeBuilder, *Plan);
   if (CM.requiresScalarEpilogue(Range)) {
     // No edge from the middle block to the unique exit block has been inserted
     // and there is nothing to fix from vector loop; phis should have incoming
     // from scalar loop only.
   } else
-    addUsersInExitBlock(HeaderVPBB, OrigLoop, RecipeBuilder, *Plan);
+    addUsersInExitBlock(OrigLoop, RecipeBuilder, *Plan);
 
   assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
          !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
@@ -8722,7 +8999,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
   // Create new empty VPlan
   auto Plan = VPlan::createInitialVPlan(
       createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
-      *PSE.getSE(), true, false, OrigLoop);
+      *PSE.getSE(), true, false, OrigLoop, nullptr, nullptr);
 
   // Build hierarchical CFG
   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
@@ -9533,15 +9810,71 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
   }
 }
 
-static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
-                                       VectorizationFactor &VF,
-                                       std::optional<unsigned> VScale, Loop *L,
-                                       ScalarEvolution &SE,
-                                       ScalarEpilogueLowering SEL) {
+static InstructionCost calculateEarlyExitCost(const TargetTransformInfo *TTI,
+                                              LoopVectorizationLegality *Legal,
+                                              Loop *L, ElementCount VF) {
+  unsigned NumCttzElemCalls = 0;
+  BasicBlock *OrigEarlyExitingBlock = Legal->getSpeculativeEarlyExitingBlock();
+  BasicBlock *OrigLoopLatch = L->getLoopLatch();
+
+  auto isUsedInEarlyExitBlock = [&L, &OrigEarlyExitingBlock](Value *V,
+                                                             User *U) -> bool {
+    auto *UI = cast<Instruction>(U);
+    if (!L->contains(UI)) {
+      PHINode *PHI = dyn_cast<PHINode>(UI);
+      assert(PHI && "Expected LCSSA form");
+      int Index = PHI->getBasicBlockIndex(OrigEarlyExitingBlock);
+      if (Index != -1 && PHI->getIncomingValue(Index) == V)
+        return true;
+    }
+    return false;
+  };
+
+  for (const auto &Entry : Legal->getInductionVars()) {
+    PHINode *OrigPhi = Entry.first;
+    Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoopLatch);
+
+    for (User *U : PostInc->users())
+      if (isUsedInEarlyExitBlock(PostInc, U))
+        NumCttzElemCalls++;
+
+    for (User *U : OrigPhi->users())
+      if (isUsedInEarlyExitBlock(OrigPhi, U))
+        NumCttzElemCalls++;
+  }
+
+  InstructionCost Cost = 0;
+  if (NumCttzElemCalls) {
+    LLVMContext &Context = L->getHeader()->getContext();
+    // Ideally we'd query the vplan for the canonical IV type, but we don't
+    // have a vplan yet so let's assume it's 64-bit.
+    auto CtzType = IntegerType::getIntNTy(Context, 64);
+    auto VecI1Type = VectorType::get(IntegerType::getInt1Ty(Context), VF);
+
+    IntrinsicCostAttributes Attrs(
+        Intrinsic::experimental_cttz_elts, CtzType,
+        {PoisonValue::get(VecI1Type), ConstantInt::getTrue(Context)});
+    Cost = TTI->getIntrinsicInstrCost(Attrs, TTI::TCK_RecipThroughput);
+    Cost *= NumCttzElemCalls;
+  }
+  return Cost;
+}
+
+static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
+                                        VectorizationFactor &VF,
+                                        std::optional<unsigned> VScale, Loop *L,
+                                        ScalarEvolution &SE,
+                                        ScalarEpilogueLowering SEL,
+                                        InstructionCost EarlyExitCost) {
   InstructionCost CheckCost = Checks.getCost();
   if (!CheckCost.isValid())
     return false;
 
+  // Add on the cost of work required in the vector early exit block, if one
+  // exists.
+  if (EarlyExitCost.isValid())
+    CheckCost += EarlyExitCost;
+
   // When interleaving only scalar and vector cost will be equal, which in turn
   // would lead to a divide by 0. Fall back to hard threshold.
   if (VF.Width.isScalar()) {
@@ -9690,6 +10023,12 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     return false;
   }
 
+  if (LVL.hasSpeculativeEarlyExit() && !EnableEarlyExitVectorization) {
+    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Auto-vectorization of early "
+                      << "exit loops is disabled.\n");
+    return false;
+  }
+
   // Entrance to the VPlan-native vectorization path. Outer loops are processed
   // here. They may require CFG and instruction level transformations before
   // even evaluating whether vectorization is profitable. Since we cannot modify
@@ -9828,12 +10167,16 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     if (VF.Width.isVector() || SelectedIC > 1)
       Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
 
+    InstructionCost EarlyExitCost = InstructionCost::getInvalid();
+    if (VF.Width.isVector() && LVL.hasSpeculativeEarlyExit())
+      EarlyExitCost = calculateEarlyExitCost(TTI, &LVL, L, VF.Width);
+
     // Check if it is profitable to vectorize with runtime checks.
     bool ForceVectorization =
         Hints.getForce() == LoopVectorizeHints::FK_Enabled;
     if (!ForceVectorization &&
-        !areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L,
-                                    *PSE.getSE(), SEL)) {
+        !isOutsideLoopWorkProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L,
+                                     *PSE.getSE(), SEL, EarlyExitCost)) {
       ORE->emit([&]() {
         return OptimizationRemarkAnalysisAliasing(
                    DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 58de6256900f0..53c79f256ea90 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -514,7 +514,8 @@ void VPBasicBlock::execute(VPTransformState *State) {
     UnreachableInst *Terminator = State->Builder.CreateUnreachable();
     // Register NewBB in its loop. In innermost loops its the same for all
     // BB's.
-    if (State->CurrentVectorLoop)
+    if (State->CurrentVectorLoop &&
+        this != getPlan()->getVectorLoopRegion()->getEarlyExit())
       State->CurrentVectorLoop->addBasicBlockToLoop(NewBB, *State->LI);
     State->Builder.SetInsertPoint(Terminator);
     State->CFG.PrevBB = NewBB;
@@ -839,6 +840,10 @@ VPlan::~VPlan() {
     delete KV.second;
   LiveOuts.clear();
 
+  for (auto &KV : EarlyExitLiveOuts)
+    delete KV.second;
+  EarlyExitLiveOuts.clear();
+
   if (Entry) {
     VPValue DummyValue;
     for (VPBlockBase *Block : vp_depth_first_shallow(Entry))
@@ -857,7 +862,9 @@ VPlan::~VPlan() {
 
 VPlanPtr VPlan::createInitialVPlan(const SCEV *TripCount, ScalarEvolution &SE,
                                    bool RequiresScalarEpilogueCheck,
-                                   bool TailFolded, Loop *TheLoop) {
+                                   bool TailFolded, Loop *TheLoop,
+                                   BasicBlock *EarlyExitingBB,
+                                   BasicBlock *EarlyExitBB) {
   VPIRBasicBlock *Entry = new VPIRBasicBlock(TheLoop->getLoopPreheader());
   VPBasicBlock *VecPreheader = new VPBasicBlock("vector.ph");
   auto Plan = std::make_unique<VPlan>(Entry, VecPreheader);
@@ -875,6 +882,12 @@ VPlanPtr VPlan::createInitialVPlan(const SCEV *TripCount, ScalarEvolution &SE,
   VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block");
   VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion);
 
+  if (EarlyExitingBB) {
+    VPBasicBlock *EarlyExitVPBB = new VPBasicBlock("vector.early.exit");
+    Plan->getVectorLoopRegion()->setEarlyExit(EarlyExitVPBB);
+    Plan->getVectorLoopRegion()->setOrigEarlyExit(EarlyExitBB);
+  }
+
   VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
   if (!RequiresScalarEpilogueCheck) {
     VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
@@ -889,7 +902,7 @@ VPlanPtr VPlan::createInitialVPlan(const SCEV *TripCount, ScalarEvolution &SE,
   // 2) If we require a scalar epilogue, there is no conditional branch as
   //    we unconditionally branch to the scalar preheader.  Do nothing.
   // 3) Otherwise, construct a runtime check.
-  BasicBlock *IRExitBlock = TheLoop->getUniqueExitBlock();
+  BasicBlock *IRExitBlock = TheLoop->getLatchExitBlock();
   auto *VPExitBlock = new VPIRBasicBlock(IRExitBlock);
   // The connection order corresponds to the operands of the conditional branch.
   VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB);
@@ -1077,6 +1090,20 @@ void VPlan::execute(VPTransformState *State) {
     }
   }
 
+  // Patch up early exiting vector block to jump to the original scalar loop's
+  // early exit block.
+  if (getVectorLoopRegion()->getEarlyExit()) {
+    VPBasicBlock *EarlyExitVPBB =
+        cast<VPBasicBlock>(getVectorLoopRegion()->getEarlyExit());
+    BasicBlock *VectorEarlyExitBB = State->CFG.VPBB2IRBB[EarlyExitVPBB];
+    BasicBlock *OrigEarlyExitBB = getVectorLoopRegion()->getOrigEarlyExit();
+    BranchInst *BI = BranchInst::Create(OrigEarlyExitBB);
+    BI->insertBefore(VectorEarlyExitBB->getTerminator());
+    VectorEarlyExitBB->getTerminator()->eraseFromParent();
+    State->CFG.DTU.applyUpdates(
+        {{DominatorTree::Insert, VectorEarlyExitBB, OrigEarlyExitBB}});
+  }
+
   State->CFG.DTU.flush();
   assert(State->CFG.DTU.getDomTree().verify(
              DominatorTree::VerificationLevel::Fast) &&
@@ -1143,6 +1170,12 @@ void VPlan::print(raw_ostream &O) const {
     KV.second->print(O, SlotTracker);
   }
 
+  if (!EarlyExitLiveOuts.empty())
+    O << "\n";
+  for (const auto &KV : EarlyExitLiveOuts) {
+    KV.second->print(O, SlotTracker);
+  }
+
   O << "}\n";
 }
 
@@ -1184,6 +1217,12 @@ void VPlan::addLiveOut(PHINode *PN, VPValue *V) {
   LiveOuts.insert({PN, new VPLiveOut(PN, V)});
 }
 
+void VPlan::addEarlyExitLiveOut(PHINode *PN, VPValue *V) {
+  assert(EarlyExitLiveOuts.count(PN) == 0 &&
+         "an exit value for PN already exists");
+  EarlyExitLiveOuts.insert({PN, new VPLiveOut(PN, V, true)});
+}
+
 static void remapOperands(VPBlockBase *Entry, VPBlockBase *NewEntry,
                           DenseMap<VPValue *, VPValue *> &Old2NewVPValues) {
   // Update the operands of all cloned recipes starting at NewEntry. This
@@ -1254,6 +1293,10 @@ VPlan *VPlan::duplicate() {
   for (const auto &[_, LO] : LiveOuts)
     NewPlan->addLiveOut(LO->getPhi(), Old2NewVPValues[LO->getOperand(0)]);
 
+  for (const auto &[_, LO] : EarlyExitLiveOuts)
+    NewPlan->addEarlyExitLiveOut(LO->getPhi(),
+                                 Old2NewVPValues[LO->getOperand(0)]);
+
   // Initialize remaining fields of cloned VPlan.
   NewPlan->VFs = VFs;
   NewPlan->UFs = UFs;
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 0b596e7e4f633..4a6ee148f2eae 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -704,9 +704,14 @@ class VPBlockBase {
 class VPLiveOut : public VPUser {
   PHINode *Phi;
 
+  /// Is this a live-out value specifically for an early exit from the vector
+  /// loop? If so, it needs special handling.
+  bool EarlyExit;
+
 public:
-  VPLiveOut(PHINode *Phi, VPValue *Op)
-      : VPUser({Op}, VPUser::VPUserID::LiveOut), Phi(Phi) {}
+  VPLiveOut(PHINode *Phi, VPValue *Op, bool EarlyExit = false)
+      : VPUser({Op}, VPUser::VPUserID::LiveOut), Phi(Phi),
+        EarlyExit(EarlyExit) {}
 
   static inline bool classof(const VPUser *U) {
     return U->getVPUserID() == VPUser::VPUserID::LiveOut;
@@ -1262,6 +1267,7 @@ class VPInstruction : public VPRecipeWithIRFlags {
     // operand). Only generates scalar values (either for the first lane only or
     // for all lanes, depending on its uses).
     PtrAdd,
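+    // Reduces a vector operand to a single scalar using a logical OR. Used
+    // here to collapse the mask of lanes that want to take the early exit.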
+    OrReduction,
   };
 
 private:
@@ -3139,7 +3145,9 @@ class VPIRBasicBlock : public VPBasicBlock {
 };
 
 /// VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks
-/// which form a Single-Entry-Single-Exiting subgraph of the output IR CFG.
+/// which form a Single-Entry-Single-Exiting or Single-Entry-Multiple-Exiting
+/// subgraph of the output IR CFG. For the multiple-exiting case we currently
+/// support only a total of two exits, and the early exit is tracked separately.
 /// A VPRegionBlock may indicate that its contents are to be replicated several
 /// times. This is designed to support predicated scalarization, in which a
 /// scalar if-then code structure needs to be generated VF * UF times. Having
@@ -3154,6 +3162,18 @@ class VPRegionBlock : public VPBlockBase {
   /// VPRegionBlock.
   VPBlockBase *Exiting;
 
+  /// Holds the early exit block of the SEME region, if one exists.
+  VPBlockBase *EarlyExit;
+
+  /// We need to keep track of the early exit block from the original scalar
+  /// loop in order to update the dominator tree correctly, since the vector
+  /// early exit will also jump to the original block.
+  BasicBlock *OrigEarlyExit;
+
+  /// If one exists, this keeps track of the vector mask that triggered the
+  /// early exit.
+  VPValue *VectorEarlyExitCond;
+
   /// An indicator whether this region is to generate multiple replicated
   /// instances of output IR corresponding to its VPBlockBases.
   bool IsReplicator;
@@ -3162,7 +3182,8 @@ class VPRegionBlock : public VPBlockBase {
   VPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exiting,
                 const std::string &Name = "", bool IsReplicator = false)
       : VPBlockBase(VPRegionBlockSC, Name), Entry(Entry), Exiting(Exiting),
-        IsReplicator(IsReplicator) {
+        EarlyExit(nullptr), OrigEarlyExit(nullptr),
+        VectorEarlyExitCond(nullptr), IsReplicator(IsReplicator) {
     assert(Entry->getPredecessors().empty() && "Entry block has predecessors.");
     assert(Exiting->getSuccessors().empty() && "Exit block has successors.");
     Entry->setParent(this);
@@ -3170,7 +3191,8 @@ class VPRegionBlock : public VPBlockBase {
   }
   VPRegionBlock(const std::string &Name = "", bool IsReplicator = false)
       : VPBlockBase(VPRegionBlockSC, Name), Entry(nullptr), Exiting(nullptr),
-        IsReplicator(IsReplicator) {}
+        EarlyExit(nullptr), OrigEarlyExit(nullptr),
+        VectorEarlyExitCond(nullptr), IsReplicator(IsReplicator) {}
 
   ~VPRegionBlock() override {
     if (Entry) {
@@ -3188,6 +3210,14 @@ class VPRegionBlock : public VPBlockBase {
   const VPBlockBase *getEntry() const { return Entry; }
   VPBlockBase *getEntry() { return Entry; }
 
+  /// Returns the early exit vector mask, if one exists.
+  Value *getEarlyExitMask(VPTransformState *State) {
+    return VectorEarlyExitCond ? State->get(VectorEarlyExitCond, 0) : nullptr;
+  }
+
+  /// Sets the early exit vector mask.
+  void setVectorEarlyExitCond(VPValue *V) { VectorEarlyExitCond = V; }
+
   /// Set \p EntryBlock as the entry VPBlockBase of this VPRegionBlock. \p
   /// EntryBlock must have no predecessors.
   void setEntry(VPBlockBase *EntryBlock) {
@@ -3209,6 +3239,22 @@ class VPRegionBlock : public VPBlockBase {
     ExitingBlock->setParent(this);
   }
 
+  void setEarlyExit(VPBlockBase *ExitBlock) {
+    assert(ExitBlock->getSuccessors().empty() &&
+           "Exit block cannot have successors.");
+    EarlyExit = ExitBlock;
+    ExitBlock->setParent(this);
+  }
+
+  const VPBlockBase *getEarlyExit() const { return EarlyExit; }
+  VPBlockBase *getEarlyExit() { return EarlyExit; }
+
+  void setOrigEarlyExit(BasicBlock *EarlyExitBlock) {
+    OrigEarlyExit = EarlyExitBlock;
+  }
+
+  BasicBlock *getOrigEarlyExit() { return OrigEarlyExit; }
+
   /// Returns the pre-header VPBasicBlock of the loop region.
   VPBasicBlock *getPreheaderVPBB() {
     assert(!isReplicator() && "should only get pre-header of loop regions");
@@ -3300,6 +3346,9 @@ class VPlan {
   /// live-outs are fixed via VPLiveOut::fixPhi.
   MapVector<PHINode *, VPLiveOut *> LiveOuts;
 
+  /// Values live out via the early exit of the vector loop. As with
+  /// LiveOuts, these are fixed via VPLiveOut::fixPhi.
+  MapVector<PHINode *, VPLiveOut *> EarlyExitLiveOuts;
+
   /// Mapping from SCEVs to the VPValues representing their expansions.
   /// NOTE: This mapping is temporary and will be removed once all users have
   /// been modeled in VPlan directly.
@@ -3340,7 +3389,9 @@ class VPlan {
   static VPlanPtr createInitialVPlan(const SCEV *TripCount,
                                      ScalarEvolution &PSE,
                                      bool RequiresScalarEpilogueCheck,
-                                     bool TailFolded, Loop *TheLoop);
+                                     bool TailFolded, Loop *TheLoop,
+                                     BasicBlock *EarlyExitingBB,
+                                     BasicBlock *EarlyExitBB);
 
   /// Prepare the plan for execution, setting up the required live-in values.
   void prepareToExecute(Value *TripCount, Value *VectorTripCount,
@@ -3480,6 +3531,17 @@ class VPlan {
     return LiveOuts;
   }
 
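+  /// Add the live-out \p PN, which is reached via the early exit, together
+  /// with its exit value \p V.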
+  void addEarlyExitLiveOut(PHINode *PN, VPValue *V);
+
+  void removeEarlyExitLiveOut(PHINode *PN) {
+    delete EarlyExitLiveOuts[PN];
+    EarlyExitLiveOuts.erase(PN);
+  }
+
+  const MapVector<PHINode *, VPLiveOut *> &getEarlyExitLiveOuts() const {
+    return EarlyExitLiveOuts;
+  }
+
   VPValue *getSCEVExpansion(const SCEV *S) const {
     return SCEVToExpansion.lookup(S);
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 1b787d0490672..bc949fa7201a4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -197,24 +197,43 @@ void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
   auto Lane = vputils::isUniformAfterVectorization(ExitValue)
                   ? VPLane::getFirstLane()
                   : VPLane::getLastLaneForVF(State.VF);
-  VPBasicBlock *MiddleVPBB =
-      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
-  VPRecipeBase *ExitingRecipe = ExitValue->getDefiningRecipe();
-  auto *ExitingVPBB = ExitingRecipe ? ExitingRecipe->getParent() : nullptr;
-  // Values leaving the vector loop reach live out phi's in the exiting block
-  // via middle block.
-  auto *PredVPBB = !ExitingVPBB || ExitingVPBB->getEnclosingLoopRegion()
-                       ? MiddleVPBB
-                       : ExitingVPBB;
+
+  VPBasicBlock *PredVPBB;
+  if (EarlyExit) {
+    PredVPBB = cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getEarlyExit());
+  } else {
+    VPBasicBlock *MiddleVPBB =
+        cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
+    VPRecipeBase *ExitRecipe = ExitValue->getDefiningRecipe();
+    auto *ExitVPBB = ExitRecipe ? ExitRecipe->getParent() : nullptr;
+    // Values leaving the vector loop reach live-out phis in the exiting block
+    // via the middle block.
+    PredVPBB = !ExitVPBB || ExitVPBB->getEnclosingLoopRegion()
+                      ? MiddleVPBB
+                      : ExitVPBB;
+  }
+
   BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
+
   // Set insertion point in PredBB in case an extract needs to be generated.
   // TODO: Model extracts explicitly.
   State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt());
-  Value *V = State.get(ExitValue, VPIteration(State.UF - 1, Lane));
+
+  Value *NewIncoming = nullptr;
+  if (!Lane.isFirstLane() && EarlyExit) {
+    assert(State.UF == 1 && "Early exits unsupported for unrolled loops");
+    NewIncoming = State.get(ExitValue, 0);
+    Value *EarlyExitMask = Plan.getVectorLoopRegion()->getEarlyExitMask(&State);
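+    // The mask is true for each lane that wanted to take the early exit, so
+    // counting its trailing zero elements gives the index of the first
+    // exiting lane, which holds the value that is live out.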
+    Value *Ctz = State.Builder.CreateCountTrailingZeroElems(
+        State.Builder.getInt64Ty(), EarlyExitMask);
+    NewIncoming = State.Builder.CreateExtractElement(NewIncoming, Ctz);
+  } else {
+    NewIncoming = State.get(ExitValue, VPIteration(State.UF - 1, Lane));
+  }
+
   if (Phi->getBasicBlockIndex(PredBB) != -1)
-    Phi->setIncomingValueForBlock(PredBB, V);
+    Phi->setIncomingValueForBlock(PredBB, NewIncoming);
   else
-    Phi->addIncoming(V, PredBB);
+    Phi->addIncoming(NewIncoming, PredBB);
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -350,6 +369,8 @@ bool VPInstruction::doesGeneratePerAllLanes() const {
   return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this);
 }
 
+// TODO: Can this function be made static given it's only ever called from one
+// place in this file?
 bool VPInstruction::canGenerateScalarForFirstLane() const {
   if (Instruction::isBinaryOp(getOpcode()))
     return true;
@@ -361,6 +382,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const {
   case VPInstruction::BranchOnCount:
   case VPInstruction::CalculateTripCountMinusVF:
   case VPInstruction::CanonicalIVIncrementForPart:
+  case VPInstruction::OrReduction:
   case VPInstruction::PtrAdd:
   case VPInstruction::ExplicitVectorLength:
     return true;
@@ -673,7 +695,10 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
     }
     return NewPhi;
   }
-
+  case VPInstruction::OrReduction: {
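+    // Collapse this part of the vector operand into a single scalar with a
+    // logical OR reduction.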
+    Value *Val = State.get(getOperand(0), Part);
+    return Builder.CreateOrReduce(Val);
+  }
   default:
     llvm_unreachable("Unsupported opcode for instruction");
   }
@@ -681,7 +706,8 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
 
 bool VPInstruction::isVectorToScalar() const {
   return getOpcode() == VPInstruction::ExtractFromEnd ||
-         getOpcode() == VPInstruction::ComputeReductionResult;
+         getOpcode() == VPInstruction::ComputeReductionResult ||
+         getOpcode() == VPInstruction::OrReduction;
 }
 
 bool VPInstruction::isSingleScalar() const {
@@ -848,6 +874,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
   case VPInstruction::PtrAdd:
     O << "ptradd";
     break;
+  case VPInstruction::OrReduction:
+    O << "or reduction";
+    break;
   default:
     O << Instruction::getOpcodeName(getOpcode());
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index c91fd0f118e31..878a0b9617fd3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -690,7 +690,7 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
 
   Type *IdxTy =
       Plan.getCanonicalIV()->getStartValue()->getLiveInIRValue()->getType();
-  const SCEV *TripCount = createTripCountSCEV(IdxTy, PSE);
+  const SCEV *TripCount = createTripCountSCEV(IdxTy, PSE, nullptr);
   ScalarEvolution &SE = *PSE.getSE();
   ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
   const SCEV *C = SE.getElementCount(TripCount->getType(), NumElements);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll
new file mode 100644
index 0000000000000..6cc49ab55078c
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll
@@ -0,0 +1,2829 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S < %s -p loop-vectorize -enable-early-exit-vectorization -mattr=+sve \
+; RUN:    -mtriple aarch64-linux-gnu | FileCheck %s --check-prefixes=CHECK,MAY_FAULT
+; RUN: opt -S < %s -p loop-vectorize -enable-early-exit-vectorization -vectorizer-no-mem-fault \
+; RUN:    -mattr=+sve -mtriple aarch64-linux-gnu | FileCheck %s --check-prefixes=CHECK,NO_FAULT
+
+declare void @init_mem(ptr, i64);
+
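+; Both loads are from allocas known to be dereferenceable, and the
+; pre-increment value of the induction variable is live out of the early exit.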
+define i64 @same_exit_block_pre_inc_use1() {
+; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
+; CHECK-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 32, i64 [[TMP1]])
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 64, [[TMP3]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 64, [[TMP5]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 64, [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 3, [[N_VEC]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 16
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[FOR_INC3:%.*]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT:    [[TMP11:%.*]] = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[OFFSET_IDX]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[DOTSPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = add <vscale x 16 x i64> zeroinitializer, [[TMP11]]
+; CHECK-NEXT:    [[TMP9:%.*]] = mul <vscale x 16 x i64> [[TMP8]], shufflevector (<vscale x 16 x i64> insertelement (<vscale x 16 x i64> poison, i64 1, i64 0), <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP10:%.*]] = add <vscale x 16 x i64> [[DOTSPLAT]], [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 1
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 3
+; CHECK-NEXT:    [[TMP27:%.*]] = add i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 5
+; CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 6
+; CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[OFFSET_IDX]], 7
+; CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[OFFSET_IDX]], 8
+; CHECK-NEXT:    [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 9
+; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 10
+; CHECK-NEXT:    [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 11
+; CHECK-NEXT:    [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 12
+; CHECK-NEXT:    [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], 13
+; CHECK-NEXT:    [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 14
+; CHECK-NEXT:    [[TMP26:%.*]] = add i64 [[OFFSET_IDX]], 15
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP28]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[TMP30]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP31]], align 1
+; CHECK-NEXT:    [[TMP33:%.*]] = icmp eq <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP32:%.*]] = xor <vscale x 16 x i1> [[TMP33]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP34:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP32]])
+; CHECK-NEXT:    br i1 [[TMP34]], label [[VECTOR_EARLY_EXIT:%.*]], label [[FOR_INC3]]
+; CHECK:       loop.inc3:
+; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP7]]
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       vector.early.exit:
+; CHECK-NEXT:    [[TMP36:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
+; CHECK-NEXT:    [[TMP37:%.*]] = add i64 [[TMP36]], [[INDEX1]]
+; CHECK-NEXT:    [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP37]]
+; CHECK-NEXT:    br label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 64, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LAND_RHS:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP39:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[TMP38]], [[TMP39]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_END_LOOPEXIT]]
+; CHECK:       loop.inc:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       loop.end:
+; CHECK-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 67, [[FOR_INC]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ], [ 67, [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i64 [[START_0_LCSSA]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  call void @init_mem(ptr %p1, i64 1024)
+  call void @init_mem(ptr %p2, i64 1024)
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 67
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ]
+  ret i64 %retval
+}
+
+
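+; As above, but the loads use two-index GEPs into the [1024 x i8] array type
+; rather than single-index i8 GEPs.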
+define i64 @same_exit_block_pre_inc_use1_gep_two_indices() {
+; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_gep_two_indices(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
+; CHECK-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 32, i64 [[TMP1]])
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 64, [[TMP3]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 64, [[TMP6]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 64, [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 3, [[N_VEC]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[OFFSET_IDX]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[DOTSPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = add <vscale x 16 x i64> zeroinitializer, [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = mul <vscale x 16 x i64> [[TMP8]], shufflevector (<vscale x 16 x i64> insertelement (<vscale x 16 x i64> poison, i64 1, i64 0), <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP14:%.*]] = add <vscale x 16 x i64> [[DOTSPLAT]], [[TMP9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 1
+; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 3
+; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 5
+; CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 6
+; CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[OFFSET_IDX]], 7
+; CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[OFFSET_IDX]], 8
+; CHECK-NEXT:    [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 9
+; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 10
+; CHECK-NEXT:    [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 11
+; CHECK-NEXT:    [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 12
+; CHECK-NEXT:    [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], 13
+; CHECK-NEXT:    [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 14
+; CHECK-NEXT:    [[TMP30:%.*]] = add i64 [[OFFSET_IDX]], 15
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [1024 x i8], ptr [[P1]], i64 0, i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP27]], align 1
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [1024 x i8], ptr [[P2]], i64 0, i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP28]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP31:%.*]] = icmp eq <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP36:%.*]] = xor <vscale x 16 x i1> [[TMP31]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP32:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP36]])
+; CHECK-NEXT:    br i1 [[TMP32]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]]
+; CHECK:       loop.inc3:
+; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP5]]
+; CHECK-NEXT:    [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       vector.early.exit:
+; CHECK-NEXT:    [[TMP34:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP36]], i1 true)
+; CHECK-NEXT:    [[TMP35:%.*]] = add i64 [[TMP34]], [[INDEX1]]
+; CHECK-NEXT:    [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP35]]
+; CHECK-NEXT:    br label [[LOOP_END:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 64, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_END]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i8], ptr [[P1]], i64 0, i64 [[INDEX]]
+; CHECK-NEXT:    [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [1024 x i8], ptr [[P2]], i64 0, i64 [[INDEX]]
+; CHECK-NEXT:    [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]]
+; CHECK:       loop.inc:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       loop.end:
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ], [ 67, [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i64 [[RETVAL]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  call void @init_mem(ptr %p1, i64 1024)
+  call void @init_mem(ptr %p2, i64 1024)
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds [1024 x i8], ptr %p1, i64 0, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds [1024 x i8], ptr %p2, i64 0, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 67
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ]
+  ret i64 %retval
+}
+
+
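+; As above, but the allocas are arrays of i32 while the loop still loads i8
+; elements.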
+define i64 @same_exit_block_pre_inc_use1_alloca_diff_type() {
+; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_alloca_diff_type(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [40 x i32], align 4
+; CHECK-NEXT:    [[P2:%.*]] = alloca [40 x i32], align 4
+; CHECK-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
+; CHECK-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 32, i64 [[TMP1]])
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 64, [[TMP3]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 64, [[TMP5]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 64, [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 3, [[N_VEC]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 16
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[FOR_INC3:%.*]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT:    [[TMP11:%.*]] = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[OFFSET_IDX]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[DOTSPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = add <vscale x 16 x i64> zeroinitializer, [[TMP11]]
+; CHECK-NEXT:    [[TMP9:%.*]] = mul <vscale x 16 x i64> [[TMP8]], shufflevector (<vscale x 16 x i64> insertelement (<vscale x 16 x i64> poison, i64 1, i64 0), <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP10:%.*]] = add <vscale x 16 x i64> [[DOTSPLAT]], [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 1
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 3
+; CHECK-NEXT:    [[TMP27:%.*]] = add i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 5
+; CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 6
+; CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[OFFSET_IDX]], 7
+; CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[OFFSET_IDX]], 8
+; CHECK-NEXT:    [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 9
+; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 10
+; CHECK-NEXT:    [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 11
+; CHECK-NEXT:    [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 12
+; CHECK-NEXT:    [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], 13
+; CHECK-NEXT:    [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 14
+; CHECK-NEXT:    [[TMP26:%.*]] = add i64 [[OFFSET_IDX]], 15
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP28]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[TMP30]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP31]], align 1
+; CHECK-NEXT:    [[TMP33:%.*]] = icmp eq <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP32:%.*]] = xor <vscale x 16 x i1> [[TMP33]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP34:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP32]])
+; CHECK-NEXT:    br i1 [[TMP34]], label [[VECTOR_EARLY_EXIT:%.*]], label [[FOR_INC3]]
+; CHECK:       loop.inc3:
+; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP7]]
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       vector.early.exit:
+; CHECK-NEXT:    [[TMP36:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
+; CHECK-NEXT:    [[TMP37:%.*]] = add i64 [[TMP36]], [[INDEX1]]
+; CHECK-NEXT:    [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP37]]
+; CHECK-NEXT:    br label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 64, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LAND_RHS:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP39:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[TMP38]], [[TMP39]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_END_LOOPEXIT]]
+; CHECK:       loop.inc:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK:       loop.end:
+; CHECK-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 67, [[FOR_INC]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ], [ 67, [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i64 [[START_0_LCSSA]]
+;
+entry:
+  %p1 = alloca [40 x i32]
+  %p2 = alloca [40 x i32]
+  call void @init_mem(ptr %p1, i64 1024)
+  call void @init_mem(ptr %p2, i64 1024)
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 67
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ]
+  ret i64 %retval
+}
+
+
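+; Here the early exit returns the constant 67, and the induction variable is
+; instead live out of the normal (latch) exit.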
+define i64 @same_exit_block_pre_inc_use2() {
+; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use2(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
+; CHECK-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 64, [[TMP1]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 64, [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 64, [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 3, [[N_VEC]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP12:%.*]] = xor <vscale x 16 x i1> [[TMP11]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP12]])
+; CHECK-NEXT:    br i1 [[TMP13]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]]
+; CHECK:       loop.inc3:
+; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP5]]
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK:       vector.early.exit:
+; CHECK-NEXT:    br label [[LOOP_END:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 64, [[N_VEC]]
+; CHECK-NEXT:    [[CMO:%.*]] = sub i64 [[N_VEC]], 1
+; CHECK-NEXT:    [[IND_ESCAPE:%.*]] = add i64 3, [[CMO]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_END]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]]
+; CHECK:       loop.inc:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK:       loop.end:
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi i64 [ 67, [[LOOP]] ], [ [[INDEX]], [[LOOP_INC]] ], [ [[IND_ESCAPE]], [[MIDDLE_BLOCK]] ], [ 67, [[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT:    ret i64 [[RETVAL]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  call void @init_mem(ptr %p1, i64 1024)
+  call void @init_mem(ptr %p2, i64 1024)
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 67
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  %retval = phi i64 [ 67, %loop ], [ %index, %loop.inc ]
+  ret i64 %retval
+}
+
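+; Here the induction variable is live out of both the early exit and the
+; normal exit.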
+define i64 @same_exit_block_pre_inc_use3() {
+; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use3(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
+; CHECK-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 32, i64 [[TMP1]])
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 64, [[TMP2]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 16
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 64, [[TMP4]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 64, [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 3, [[N_VEC]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[OFFSET_IDX]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[DOTSPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = add <vscale x 16 x i64> zeroinitializer, [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = mul <vscale x 16 x i64> [[TMP8]], shufflevector (<vscale x 16 x i64> insertelement (<vscale x 16 x i64> poison, i64 1, i64 0), <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP10:%.*]] = add <vscale x 16 x i64> [[DOTSPLAT]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 1
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 3
+; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 5
+; CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 6
+; CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[OFFSET_IDX]], 7
+; CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[OFFSET_IDX]], 8
+; CHECK-NEXT:    [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 9
+; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 10
+; CHECK-NEXT:    [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 11
+; CHECK-NEXT:    [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 12
+; CHECK-NEXT:    [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], 13
+; CHECK-NEXT:    [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 14
+; CHECK-NEXT:    [[TMP26:%.*]] = add i64 [[OFFSET_IDX]], 15
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP28]], align 1
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP29]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP31:%.*]] = icmp eq <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP32:%.*]] = xor <vscale x 16 x i1> [[TMP31]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP33:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP32]])
+; CHECK-NEXT:    br i1 [[TMP33]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]]
+; CHECK:       loop.inc3:
+; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP6]]
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK:       vector.early.exit:
+; CHECK-NEXT:    [[TMP35:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
+; CHECK-NEXT:    [[TMP36:%.*]] = add i64 [[TMP35]], [[INDEX1]]
+; CHECK-NEXT:    [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP36]]
+; CHECK-NEXT:    [[TMP37:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
+; CHECK-NEXT:    [[TMP38:%.*]] = add i64 [[TMP37]], [[INDEX1]]
+; CHECK-NEXT:    [[IND_EARLY_ESCAPE5:%.*]] = add i64 3, [[TMP38]]
+; CHECK-NEXT:    br label [[LOOP_END:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 64, [[N_VEC]]
+; CHECK-NEXT:    [[CMO:%.*]] = sub i64 [[N_VEC]], 1
+; CHECK-NEXT:    [[IND_ESCAPE:%.*]] = add i64 3, [[CMO]]
+; CHECK-NEXT:    [[CMO6:%.*]] = sub i64 [[N_VEC]], 1
+; CHECK-NEXT:    [[IND_ESCAPE7:%.*]] = add i64 3, [[CMO6]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_END]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]]
+; CHECK:       loop.inc:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK:       loop.end:
+; CHECK-NEXT:    [[INDEX_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ], [ [[INDEX]], [[LOOP]] ], [ [[IND_EARLY_ESCAPE5]], [[VECTOR_EARLY_EXIT]] ], [ [[IND_ESCAPE7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i64 [[INDEX_LCSSA]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  call void @init_mem(ptr %p1, i64 1024)
+  call void @init_mem(ptr %p2, i64 1024)
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 67
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  ret i64 %index
+}
+
+
+; In this example the early exit block appears in the list of ExitNotTaken
+; SCEVs, but its exit count is not computable.
+define i64 @same_exit_block_pre_inc_use4() {
+; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use4(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [1024 x i64], align 8
+; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i64], align 8
+; CHECK-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
+; CHECK-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 4, i64 [[TMP1]])
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 64, [[TMP2]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 64, [[TMP4]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 64, [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 3, [[N_VEC]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+; CHECK-NEXT:    [[TMP8:%.*]] = add <vscale x 2 x i64> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = mul <vscale x 2 x i64> [[TMP8]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 3, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer), [[TMP9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 2
+; CHECK-NEXT:    [[TMP12:%.*]] = mul i64 1, [[TMP11]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP12]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT5:%.*]], [[LOOP_INC4:%.*]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP_INC4]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT:    [[TMP13:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+; CHECK-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[OFFSET_IDX]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT3:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT2]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = mul <vscale x 2 x i64> [[TMP14]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP16:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT3]], [[TMP15]]
+; CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[OFFSET_IDX]], 1
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i64, ptr [[P1]], i64 [[TMP17]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[TMP19]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp ult <vscale x 2 x i64> [[VEC_IND]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP22:%.*]] = xor <vscale x 2 x i1> [[TMP21]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP23:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP22]])
+; CHECK-NEXT:    br i1 [[TMP23]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC4]]
+; CHECK:       loop.inc4:
+; CHECK-NEXT:    [[INDEX_NEXT5]] = add nuw i64 [[INDEX1]], [[TMP6]]
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; CHECK-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT5]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK:       vector.early.exit:
+; CHECK-NEXT:    [[TMP25:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> [[TMP22]], i1 true)
+; CHECK-NEXT:    [[TMP26:%.*]] = add i64 [[TMP25]], [[INDEX1]]
+; CHECK-NEXT:    [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP26]]
+; CHECK-NEXT:    br label [[LOOP_END:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 64, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_END]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[LD1:%.*]] = load i64, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp ult i64 [[INDEX]], [[LD1]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]]
+; CHECK:       loop.inc:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK:       loop.end:
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ], [ 67, [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i64 [[RETVAL]]
+;
+entry:
+  %p1 = alloca [1024 x i64]
+  %p2 = alloca [1024 x i64]
+  call void @init_mem(ptr %p1, i64 1024)
+  call void @init_mem(ptr %p2, i64 1024)
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i64, ptr %p1, i64 %index
+  %ld1 = load i64, ptr %arrayidx, align 1
+  %cmp3 = icmp ult i64 %index, %ld1
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 67
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ]
+  ret i64 %retval
+}
+
+
+
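+; Here the exit block uses the post-increment value of the induction variable.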
+define i64 @same_exit_block_post_inc_use() {
+; CHECK-LABEL: define i64 @same_exit_block_post_inc_use(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
+; CHECK-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 32, i64 [[TMP1]])
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 64, [[TMP2]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 16
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 64, [[TMP4]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 64, [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 3, [[N_VEC]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[OFFSET_IDX]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[DOTSPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = add <vscale x 16 x i64> zeroinitializer, [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = mul <vscale x 16 x i64> [[TMP8]], shufflevector (<vscale x 16 x i64> insertelement (<vscale x 16 x i64> poison, i64 1, i64 0), <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP10:%.*]] = add <vscale x 16 x i64> [[DOTSPLAT]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 1
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 3
+; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 5
+; CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 6
+; CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[OFFSET_IDX]], 7
+; CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[OFFSET_IDX]], 8
+; CHECK-NEXT:    [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 9
+; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 10
+; CHECK-NEXT:    [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 11
+; CHECK-NEXT:    [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 12
+; CHECK-NEXT:    [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], 13
+; CHECK-NEXT:    [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 14
+; CHECK-NEXT:    [[TMP26:%.*]] = add i64 [[OFFSET_IDX]], 15
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP28]], align 1
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP29]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP31:%.*]] = icmp eq <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP32:%.*]] = xor <vscale x 16 x i1> [[TMP31]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP33:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP32]])
+; CHECK-NEXT:    br i1 [[TMP33]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]]
+; CHECK:       loop.inc3:
+; CHECK-NEXT:    [[TMP34:%.*]] = add i64 [[TMP11]], 1
+; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP6]]
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK:       vector.early.exit:
+; CHECK-NEXT:    [[TMP36:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
+; CHECK-NEXT:    [[TMP37:%.*]] = add i64 [[TMP36]], [[INDEX1]]
+; CHECK-NEXT:    [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP37]]
+; CHECK-NEXT:    br label [[LOOP_END:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 64, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_END]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]]
+; CHECK:       loop.inc:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK:       loop.end:
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ [[INDEX_NEXT]], [[LOOP_INC]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i64 [[RETVAL]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  call void @init_mem(ptr %p1, i64 1024)
+  call void @init_mem(ptr %p2, i64 1024)
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 67
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  %retval = phi i64 [ %index, %loop ], [ %index.next, %loop.inc ]
+  ret i64 %retval
+}
+
+define i64 @same_exit_block_post_inc_use2() {
+; CHECK-LABEL: define i64 @same_exit_block_post_inc_use2(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
+; CHECK-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 32, i64 [[TMP1]])
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 64, [[TMP2]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 16
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 64, [[TMP4]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 64, [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 3, [[N_VEC]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[TMP7]], 1
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP14:%.*]] = xor <vscale x 16 x i1> [[TMP13]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP15:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP14]])
+; CHECK-NEXT:    br i1 [[TMP15]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]]
+; CHECK:       loop.inc3:
+; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP6]]
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK:       vector.early.exit:
+; CHECK-NEXT:    [[TMP17:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP14]], i1 true)
+; CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[TMP17]], [[INDEX1]]
+; CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[TMP18]], 1
+; CHECK-NEXT:    [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP19]]
+; CHECK-NEXT:    br label [[LOOP_END:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 64, [[N_VEC]]
+; CHECK-NEXT:    [[CMO:%.*]] = sub i64 [[N_VEC]], 1
+; CHECK-NEXT:    [[IND_ESCAPE:%.*]] = add i64 3, [[CMO]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_END]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]]
+; CHECK:       loop.inc:
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK:       loop.end:
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi i64 [ [[INDEX_NEXT]], [[LOOP]] ], [ [[INDEX]], [[LOOP_INC]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ], [ [[IND_ESCAPE]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i64 [[RETVAL]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  call void @init_mem(ptr %p1, i64 1024)
+  call void @init_mem(ptr %p2, i64 1024)
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %index.next = add i64 %index, 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %exitcond = icmp ne i64 %index.next, 67
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  %retval = phi i64 [ %index.next, %loop ], [ %index, %loop.inc ]
+  ret i64 %retval
+}
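+
+; When the exit value is the post-increment IV (%index.next), the early
+; escape in the checks above gains an extra +1 on top of the pre-increment
+; form. A sketch (illustrative names):
+;
+;   %pos    = add i64 %lane, %index1
+;   %post   = add i64 %pos, 1          ; post-increment adjustment
+;   %escape = add i64 3, %post
+;
+; so a first mismatch in lane 0 of the first vector iteration escapes with
+; 3 + (0 + 0) + 1 = 4, matching %index.next in the scalar loop.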
+
+define i64 @same_exit_block_phi_of_consts() {
+; CHECK-LABEL: define i64 @same_exit_block_phi_of_consts(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
+; CHECK-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 64, [[TMP1]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 64, [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 64, [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 3, [[N_VEC]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP12:%.*]] = xor <vscale x 16 x i1> [[TMP11]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP12]])
+; CHECK-NEXT:    br i1 [[TMP13]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]]
+; CHECK:       loop.inc3:
+; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP5]]
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK:       vector.early.exit:
+; CHECK-NEXT:    br label [[LOOP_END:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 64, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_END]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]]
+; CHECK:       loop.inc:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK:       loop.end:
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi i64 [ 0, [[LOOP]] ], [ 1, [[LOOP_INC]] ], [ 1, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT:    ret i64 [[RETVAL]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  call void @init_mem(ptr %p1, i64 1024)
+  call void @init_mem(ptr %p2, i64 1024)
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 67
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  %retval = phi i64 [ 0, %loop ], [ 1, %loop.inc ]
+  ret i64 %retval
+}
+
+
+define i64 @diff_exit_block_pre_inc_use1() {
+; CHECK-LABEL: define i64 @diff_exit_block_pre_inc_use1(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
+; CHECK-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 32, i64 [[TMP1]])
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 64, [[TMP2]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 16
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 64, [[TMP4]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 64, [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 3, [[N_VEC]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[OFFSET_IDX]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[DOTSPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = add <vscale x 16 x i64> zeroinitializer, [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = mul <vscale x 16 x i64> [[TMP8]], shufflevector (<vscale x 16 x i64> insertelement (<vscale x 16 x i64> poison, i64 1, i64 0), <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP10:%.*]] = add <vscale x 16 x i64> [[DOTSPLAT]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 1
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 3
+; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 5
+; CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 6
+; CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[OFFSET_IDX]], 7
+; CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[OFFSET_IDX]], 8
+; CHECK-NEXT:    [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 9
+; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 10
+; CHECK-NEXT:    [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 11
+; CHECK-NEXT:    [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 12
+; CHECK-NEXT:    [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], 13
+; CHECK-NEXT:    [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 14
+; CHECK-NEXT:    [[TMP26:%.*]] = add i64 [[OFFSET_IDX]], 15
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP28]], align 1
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP29]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP31:%.*]] = icmp eq <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP32:%.*]] = xor <vscale x 16 x i1> [[TMP31]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP33:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP32]])
+; CHECK-NEXT:    br i1 [[TMP33]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]]
+; CHECK:       loop.inc3:
+; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP6]]
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK:       vector.early.exit:
+; CHECK-NEXT:    [[TMP35:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
+; CHECK-NEXT:    [[TMP36:%.*]] = add i64 [[TMP35]], [[INDEX1]]
+; CHECK-NEXT:    [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP36]]
+; CHECK-NEXT:    br label [[LOOP_EARLY_EXIT:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 64, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT]]
+; CHECK:       loop.inc:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK:       loop.early.exit:
+; CHECK-NEXT:    [[RETVAL1:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT:    ret i64 [[RETVAL1]]
+; CHECK:       loop.end:
+; CHECK-NEXT:    [[RETVAL2:%.*]] = phi i64 [ 67, [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i64 [[RETVAL2]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  call void @init_mem(ptr %p1, i64 1024)
+  call void @init_mem(ptr %p2, i64 1024)
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.early.exit
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 67
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.early.exit:
+  %retval1 = phi i64 [ %index, %loop ]
+  ret i64 %retval1
+
+loop.end:
+  %retval2 = phi i64 [ 67, %loop.inc ]
+  ret i64 %retval2
+}
+
+define i64 @diff_exit_block_pre_inc_use2() {
+; CHECK-LABEL: define i64 @diff_exit_block_pre_inc_use2(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
+; CHECK-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 64, [[TMP1]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 64, [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 64, [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 3, [[N_VEC]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP12:%.*]] = xor <vscale x 16 x i1> [[TMP11]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP12]])
+; CHECK-NEXT:    br i1 [[TMP13]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]]
+; CHECK:       loop.inc3:
+; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP5]]
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK:       vector.early.exit:
+; CHECK-NEXT:    br label [[LOOP_EARLY_EXIT:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 64, [[N_VEC]]
+; CHECK-NEXT:    [[CMO:%.*]] = sub i64 [[N_VEC]], 1
+; CHECK-NEXT:    [[IND_ESCAPE:%.*]] = add i64 3, [[CMO]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT]]
+; CHECK:       loop.inc:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP23:![0-9]+]]
+; CHECK:       loop.early.exit:
+; CHECK-NEXT:    [[RETVAL1:%.*]] = phi i64 [ 67, [[LOOP]] ], [ 67, [[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT:    ret i64 [[RETVAL1]]
+; CHECK:       loop.end:
+; CHECK-NEXT:    [[RETVAL2:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ], [ [[IND_ESCAPE]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i64 [[RETVAL2]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  call void @init_mem(ptr %p1, i64 1024)
+  call void @init_mem(ptr %p2, i64 1024)
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.early.exit
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 67
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.early.exit:
+  %retval1 = phi i64 [ 67, %loop ]
+  ret i64 %retval1
+
+loop.end:
+  %retval2 = phi i64 [ %index, %loop.inc ]
+  ret i64 %retval2
+}
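+
+; When the latch exit's value is the pre-increment IV, the middle block above
+; computes the IV of the last processed element rather than the resume value:
+; %ind.escape = 3 + (%n.vec - 1). e.g. with vscale=1 (VF=16), %n.vec = 64,
+; giving 3 + 63 = 66, the final value of %index whose increment (67)
+; terminates the scalar loop.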
+
+define i64 @diff_exit_block_pre_inc_use3() {
+; CHECK-LABEL: define i64 @diff_exit_block_pre_inc_use3(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
+; CHECK-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 32, i64 [[TMP1]])
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 64, [[TMP2]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 16
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 64, [[TMP4]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 64, [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 3, [[N_VEC]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX2:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT5:%.*]], [[LOOP_INC4:%.*]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[OFFSET_IDX]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[DOTSPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = add <vscale x 16 x i64> zeroinitializer, [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = mul <vscale x 16 x i64> [[TMP8]], shufflevector (<vscale x 16 x i64> insertelement (<vscale x 16 x i64> poison, i64 1, i64 0), <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP10:%.*]] = add <vscale x 16 x i64> [[DOTSPLAT]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 1
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 3
+; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 5
+; CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 6
+; CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[OFFSET_IDX]], 7
+; CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[OFFSET_IDX]], 8
+; CHECK-NEXT:    [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 9
+; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 10
+; CHECK-NEXT:    [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 11
+; CHECK-NEXT:    [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 12
+; CHECK-NEXT:    [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], 13
+; CHECK-NEXT:    [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 14
+; CHECK-NEXT:    [[TMP26:%.*]] = add i64 [[OFFSET_IDX]], 15
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP28]], align 1
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP29]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP31:%.*]] = icmp eq <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD3]]
+; CHECK-NEXT:    [[TMP32:%.*]] = xor <vscale x 16 x i1> [[TMP31]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP33:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP32]])
+; CHECK-NEXT:    br i1 [[TMP33]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC4]]
+; CHECK:       loop.inc4:
+; CHECK-NEXT:    [[INDEX_NEXT5]] = add nuw i64 [[INDEX2]], [[TMP6]]
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT5]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK:       vector.early.exit:
+; CHECK-NEXT:    [[TMP35:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
+; CHECK-NEXT:    [[TMP36:%.*]] = add i64 [[TMP35]], [[INDEX2]]
+; CHECK-NEXT:    [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP36]]
+; CHECK-NEXT:    br label [[LOOP_EARLY_EXIT:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 64, [[N_VEC]]
+; CHECK-NEXT:    [[CMO:%.*]] = sub i64 [[N_VEC]], 1
+; CHECK-NEXT:    [[IND_ESCAPE:%.*]] = add i64 3, [[CMO]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT]]
+; CHECK:       loop.inc:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP25:![0-9]+]]
+; CHECK:       loop.early.exit:
+; CHECK-NEXT:    [[INDEX_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT:    ret i64 [[INDEX_LCSSA]]
+; CHECK:       loop.end:
+; CHECK-NEXT:    [[INDEX_LCSSA1:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ], [ [[IND_ESCAPE]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i64 [[INDEX_LCSSA1]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  call void @init_mem(ptr %p1, i64 1024)
+  call void @init_mem(ptr %p2, i64 1024)
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.early.exit
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 67
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.early.exit:
+  ret i64 %index
+
+loop.end:
+  ret i64 %index
+}
+
+
+define i64 @diff_exit_block_phi_of_consts() {
+; CHECK-LABEL: define i64 @diff_exit_block_phi_of_consts(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
+; CHECK-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 64, [[TMP1]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 64, [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 64, [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 3, [[N_VEC]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP12:%.*]] = xor <vscale x 16 x i1> [[TMP11]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP12]])
+; CHECK-NEXT:    br i1 [[TMP13]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]]
+; CHECK:       loop.inc3:
+; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP5]]
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; CHECK:       vector.early.exit:
+; CHECK-NEXT:    br label [[LOOP_EARLY_EXIT:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 64, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT]]
+; CHECK:       loop.inc:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP27:![0-9]+]]
+; CHECK:       loop.early.exit:
+; CHECK-NEXT:    ret i64 0
+; CHECK:       loop.end:
+; CHECK-NEXT:    ret i64 1
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  call void @init_mem(ptr %p1, i64 1024)
+  call void @init_mem(ptr %p2, i64 1024)
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.early.exit
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 67
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.early.exit:
+  ret i64 0
+
+loop.end:
+  ret i64 1
+}
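+
+; With only constant exit values there is nothing to compute on the early
+; exit path, so vector.early.exit in the checks above reduces to a plain
+; branch:
+;
+;   vector.early.exit:
+;     br label %loop.early.exit
+;
+; and the two exits return the constants (0 and 1) directly.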
+
+
+define i64 @diff_exit_block_post_inc_use1() {
+; CHECK-LABEL: define i64 @diff_exit_block_post_inc_use1(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
+; CHECK-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 32, i64 [[TMP1]])
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 64, [[TMP2]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 16
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 64, [[TMP4]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 64, [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 3, [[N_VEC]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[OFFSET_IDX]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[DOTSPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = add <vscale x 16 x i64> zeroinitializer, [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = mul <vscale x 16 x i64> [[TMP8]], shufflevector (<vscale x 16 x i64> insertelement (<vscale x 16 x i64> poison, i64 1, i64 0), <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP10:%.*]] = add <vscale x 16 x i64> [[DOTSPLAT]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 1
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 3
+; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 5
+; CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 6
+; CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[OFFSET_IDX]], 7
+; CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[OFFSET_IDX]], 8
+; CHECK-NEXT:    [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 9
+; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 10
+; CHECK-NEXT:    [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 11
+; CHECK-NEXT:    [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 12
+; CHECK-NEXT:    [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], 13
+; CHECK-NEXT:    [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 14
+; CHECK-NEXT:    [[TMP26:%.*]] = add i64 [[OFFSET_IDX]], 15
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP28]], align 1
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP29]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP31:%.*]] = icmp eq <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP32:%.*]] = xor <vscale x 16 x i1> [[TMP31]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP33:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP32]])
+; CHECK-NEXT:    br i1 [[TMP33]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]]
+; CHECK:       loop.inc3:
+; CHECK-NEXT:    [[TMP34:%.*]] = add i64 [[TMP11]], 1
+; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP6]]
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
+; CHECK:       vector.early.exit:
+; CHECK-NEXT:    [[TMP36:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
+; CHECK-NEXT:    [[TMP37:%.*]] = add i64 [[TMP36]], [[INDEX1]]
+; CHECK-NEXT:    [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP37]]
+; CHECK-NEXT:    br label [[LOOP_EARLY_EXIT:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 64, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT]]
+; CHECK:       loop.inc:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP29:![0-9]+]]
+; CHECK:       loop.early.exit:
+; CHECK-NEXT:    [[RETVAL1:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT:    ret i64 [[RETVAL1]]
+; CHECK:       loop.end:
+; CHECK-NEXT:    [[RETVAL2:%.*]] = phi i64 [ [[INDEX_NEXT]], [[LOOP_INC]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i64 [[RETVAL2]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  call void @init_mem(ptr %p1, i64 1024)
+  call void @init_mem(ptr %p2, i64 1024)
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.early.exit
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 67
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.early.exit:
+  %retval1 = phi i64 [ %index, %loop ]
+  ret i64 %retval1
+
+loop.end:
+  %retval2 = phi i64 [ %index.next, %loop.inc ]
+  ret i64 %retval2
+}
+
+
+define i64 @diff_exit_block_post_inc_use2() {
+; CHECK-LABEL: define i64 @diff_exit_block_post_inc_use2(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
+; CHECK-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 32, i64 [[TMP1]])
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 64, [[TMP2]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 16
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 64, [[TMP4]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 64, [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 3, [[N_VEC]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP12]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP14:%.*]] = xor <vscale x 16 x i1> [[TMP13]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP15:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP14]])
+; CHECK-NEXT:    br i1 [[TMP15]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]]
+; CHECK:       loop.inc3:
+; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP6]]
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
+; CHECK:       vector.early.exit:
+; CHECK-NEXT:    [[TMP17:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP14]], i1 true)
+; CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[TMP17]], [[INDEX1]]
+; CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[TMP18]], 1
+; CHECK-NEXT:    [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP19]]
+; CHECK-NEXT:    br label [[LOOP_EARLY_EXIT:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 64, [[N_VEC]]
+; CHECK-NEXT:    [[CMO:%.*]] = sub i64 [[N_VEC]], 1
+; CHECK-NEXT:    [[IND_ESCAPE:%.*]] = add i64 3, [[CMO]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT]]
+; CHECK:       loop.inc:
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP31:![0-9]+]]
+; CHECK:       loop.early.exit:
+; CHECK-NEXT:    [[RETVAL1:%.*]] = phi i64 [ [[INDEX_NEXT]], [[LOOP]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT:    ret i64 [[RETVAL1]]
+; CHECK:       loop.end:
+; CHECK-NEXT:    [[RETVAL2:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ], [ [[IND_ESCAPE]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i64 [[RETVAL2]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  call void @init_mem(ptr %p1, i64 1024)
+  call void @init_mem(ptr %p2, i64 1024)
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %index.next = add i64 %index, 1
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.early.exit
+
+loop.inc:
+  %exitcond = icmp ne i64 %index.next, 67
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.early.exit:
+  %retval1 = phi i64 [ %index.next, %loop ]
+  ret i64 %retval1
+
+loop.end:
+  %retval2 = phi i64 [ %index, %loop.inc ]
+  ret i64 %retval2
+}
+
+
+; The early exit (i.e. the exit with an unknown exit-not-taken count) is the
+; loop latch - we don't support vectorising this yet.
+define i64 @early_exit_on_last_block() {
+; CHECK-LABEL: define i64 @early_exit_on_last_block(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
+; CHECK-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT:    br label [[LAND_RHS:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[SEARCH:%.*]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; CHECK-NEXT:    br i1 [[CMP1]], label [[SEARCH]], label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK:       search:
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP42:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[TMP41]], [[TMP42]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[FOR_END_LOOPEXIT]], label [[LAND_RHS]]
+; CHECK:       loop.end:
+; CHECK-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ 64, [[LAND_RHS]] ], [ [[INDEX]], [[SEARCH]] ]
+; CHECK-NEXT:    ret i64 [[START_0_LCSSA]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  call void @init_mem(ptr %p1, i64 1024)
+  call void @init_mem(ptr %p2, i64 1024)
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %search ], [ 3, %entry ]
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 67
+  br i1 %exitcond, label %search, label %loop.end
+
+search:
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.end, label %loop
+
+loop.end:
+  %retval = phi i64 [ 64, %loop ], [ %index, %search ]
+  ret i64 %retval
+}
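+
+; Note that the checks above contain no vector.body or min-iteration checks:
+; when the only unknown-count exit is the latch itself, the loop is currently
+; left scalar.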
+
+
+; There are multiple exit blocks - two of them have exact exit-not-taken
+; counts and the other's count is unknown, i.e. it is the "early exit".
+define i64 @multiple_exits_one_early() {
+; CHECK-LABEL: define i64 @multiple_exits_one_early(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
+; CHECK-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 8, i64 [[TMP1]])
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 62, [[TMP2]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 62, [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i64 [[TMP4]], i64 [[N_MOD_VF]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 62, [[TMP6]]
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 3, [[N_VEC]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[OFFSET_IDX]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = mul <vscale x 4 x i64> [[TMP10]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP12:%.*]] = add <vscale x 4 x i64> [[DOTSPLAT]], [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 1
+; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 3
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP13]]
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP17]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP18]], align 1
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP13]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP19]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq <vscale x 4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP22:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP21]])
+; CHECK-NEXT:    br i1 [[TMP22]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]]
+; CHECK:       loop.inc3:
+; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP8]]
+; CHECK-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
+; CHECK:       vector.early.exit:
+; CHECK-NEXT:    [[TMP24:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> [[TMP21]], i1 true)
+; CHECK-NEXT:    [[TMP25:%.*]] = add i64 [[TMP24]], [[INDEX1]]
+; CHECK-NEXT:    [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP25]]
+; CHECK-NEXT:    br label [[LOOP_END:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne i64 [[INDEX]], 64
+; CHECK-NEXT:    br i1 [[CMP1]], label [[SEARCH:%.*]], label [[LOOP_END]]
+; CHECK:       search:
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[LOOP_END]], label [[LOOP_INC]]
+; CHECK:       loop.inc:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 128
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP33:![0-9]+]]
+; CHECK:       loop.end:
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi i64 [ 64, [[LOOP]] ], [ [[INDEX]], [[SEARCH]] ], [ 128, [[LOOP_INC]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT:    ret i64 [[RETVAL]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  call void @init_mem(ptr %p1, i64 1024)
+  call void @init_mem(ptr %p2, i64 1024)
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %cmp1 = icmp ne i64 %index, 64
+  br i1 %cmp1, label %search, label %loop.end
+
+search:
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.end, label %loop.inc
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 128
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  %retval = phi i64 [ 64, %loop ], [ %index, %search ], [ 128, %loop.inc ]
+  ret i64 %retval
+}
+
+
+; We don't currently support multiple early exits.
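+; A rough C equivalent (illustrative sketch; names and types are ours):
+;
+;   #include <stdint.h>
+;   uint64_t f(uint8_t *p1, uint8_t *p2) {
+;     for (uint64_t i = 3;; i++) {
+;       if (p1[i] == p2[i])   // early exit 1 (unknown count)
+;         return i;
+;       if (p1[i] < 34)       // early exit 2 (unknown count)
+;         return 100;
+;       if (i + 1 == 67)      // counted exit
+;         return 43;
+;     }
+;   }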
+define i64 @multiple_early_exits() {
+; CHECK-LABEL: define i64 @multiple_early_exits(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
+; CHECK-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT:    br label [[LAND_RHS:%.*]]
+; CHECK:       search1:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC1:%.*]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP42:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[TMP41]], [[TMP42]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_INC:%.*]]
+; CHECK:       search2:
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i8 [[TMP41]], 34
+; CHECK-NEXT:    br i1 [[CMP2]], label [[FOR_END_LOOPEXIT]], label [[FOR_INC1]]
+; CHECK:       loop.inc:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]]
+; CHECK:       loop.end:
+; CHECK-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 100, [[FOR_INC]] ], [ 43, [[FOR_INC1]] ]
+; CHECK-NEXT:    ret i64 [[START_0_LCSSA]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  call void @init_mem(ptr %p1, i64 1024)
+  call void @init_mem(ptr %p2, i64 1024)
+  br label %search1
+
+search1:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp1 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp1, label %loop.end, label %search2
+
+search2:
+  %cmp2 = icmp ult i8 %ld1, 34
+  br i1 %cmp2, label %loop.end, label %loop.inc
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 67
+  br i1 %exitcond, label %search1, label %loop.end
+
+loop.end:
+  %retval = phi i64 [ %index, %search1 ], [ 100, %search2 ], [ 43, %loop.inc ]
+  ret i64 %retval
+}
+
+
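+; The latch branches back unconditionally (%exitcond is computed but never
+; used), so the loop has no countable exit and cannot be vectorised.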
+define i64 @early_exit_infinite_loop() {
+; CHECK-LABEL: define i64 @early_exit_infinite_loop(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
+; CHECK-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT:    br label [[LAND_RHS:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP39:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[TMP38]], [[TMP39]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK:       loop.inc:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; CHECK-NEXT:    br label [[LAND_RHS]]
+; CHECK:       loop.end:
+; CHECK-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ]
+; CHECK-NEXT:    ret i64 [[START_0_LCSSA]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  call void @init_mem(ptr %p1, i64 1024)
+  call void @init_mem(ptr %p2, i64 1024)
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 67
+  br label %loop
+
+loop.end:
+  %retval = phi i64 [ %index, %loop ]
+  ret i64 %retval
+}
+
+
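+; The early-exit condition combines the load comparison with a loop-invariant
+; condition via a select; this is still vectorisable.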
+define i64 @same_exit_block_pre_inc_use_inv_cond(i1 %cond) {
+; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use_inv_cond(
+; CHECK-SAME: i1 [[COND:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
+; CHECK-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 32, i64 [[TMP1]])
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 64, [[TMP2]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 16
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 64, [[TMP4]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 64, [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 3, [[N_VEC]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[OFFSET_IDX]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[DOTSPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = add <vscale x 16 x i64> zeroinitializer, [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = mul <vscale x 16 x i64> [[TMP8]], shufflevector (<vscale x 16 x i64> insertelement (<vscale x 16 x i64> poison, i64 1, i64 0), <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP10:%.*]] = add <vscale x 16 x i64> [[DOTSPLAT]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 1
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 3
+; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 5
+; CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 6
+; CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[OFFSET_IDX]], 7
+; CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[OFFSET_IDX]], 8
+; CHECK-NEXT:    [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 9
+; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 10
+; CHECK-NEXT:    [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 11
+; CHECK-NEXT:    [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 12
+; CHECK-NEXT:    [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], 13
+; CHECK-NEXT:    [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 14
+; CHECK-NEXT:    [[TMP26:%.*]] = add i64 [[OFFSET_IDX]], 15
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP28]], align 1
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP29]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP31:%.*]] = icmp eq <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP32:%.*]] = select i1 [[COND]], <vscale x 16 x i1> [[TMP31]], <vscale x 16 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP33:%.*]] = xor <vscale x 16 x i1> [[TMP32]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP34:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP33]])
+; CHECK-NEXT:    br i1 [[TMP34]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]]
+; CHECK:       loop.inc3:
+; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP6]]
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]]
+; CHECK:       vector.early.exit:
+; CHECK-NEXT:    [[TMP36:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP33]], i1 true)
+; CHECK-NEXT:    [[TMP37:%.*]] = add i64 [[TMP36]], [[INDEX1]]
+; CHECK-NEXT:    [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP37]]
+; CHECK-NEXT:    br label [[LOOP_END:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 64, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_END]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; CHECK-NEXT:    [[CMP4:%.*]] = select i1 [[COND]], i1 [[CMP3]], i1 false
+; CHECK-NEXT:    br i1 [[CMP4]], label [[LOOP_INC]], label [[LOOP_END]]
+; CHECK:       loop.inc:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP35:![0-9]+]]
+; CHECK:       loop.end:
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ], [ 67, [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i64 [[RETVAL]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  call void @init_mem(ptr %p1, i64 1024)
+  call void @init_mem(ptr %p2, i64 1024)
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  %cmp4 = select i1 %cond, i1 %cmp3, i1 false
+  br i1 %cmp4, label %loop.inc, label %loop.end
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 67
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ]
+  ret i64 %retval
+}
+
+
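+; The induction variable counts down from 1023, so the vector body accesses
+; the data with reversed (negative-stride) loads.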
+define i64 @same_exit_block_pre_inc_use1_reverse() {
+; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_reverse(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
+; CHECK-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 32, i64 [[TMP1]])
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1023, [[TMP2]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 16
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1023, [[TMP4]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1023, [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = sub i64 1023, [[N_VEC]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT5:%.*]], [[LOOP_INC4:%.*]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[OFFSET_IDX]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[DOTSPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = add <vscale x 16 x i64> zeroinitializer, [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = mul <vscale x 16 x i64> [[TMP8]], shufflevector (<vscale x 16 x i64> insertelement (<vscale x 16 x i64> poison, i64 -1, i64 0), <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP10:%.*]] = add <vscale x 16 x i64> [[DOTSPLAT]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], -1
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], -2
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], -3
+; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], -4
+; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], -5
+; CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], -6
+; CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[OFFSET_IDX]], -7
+; CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[OFFSET_IDX]], -8
+; CHECK-NEXT:    [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], -9
+; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], -10
+; CHECK-NEXT:    [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], -11
+; CHECK-NEXT:    [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], -12
+; CHECK-NEXT:    [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], -13
+; CHECK-NEXT:    [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], -14
+; CHECK-NEXT:    [[TMP26:%.*]] = add i64 [[OFFSET_IDX]], -15
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP29:%.*]] = mul i64 [[TMP28]], 16
+; CHECK-NEXT:    [[TMP30:%.*]] = mul i64 0, [[TMP29]]
+; CHECK-NEXT:    [[TMP31:%.*]] = sub i64 1, [[TMP29]]
+; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i64 [[TMP30]]
+; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[TMP32]], i64 [[TMP31]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP33]], align 1
+; CHECK-NEXT:    [[REVERSE:%.*]] = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> [[WIDE_LOAD]])
+; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP35:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP36:%.*]] = mul i64 [[TMP35]], 16
+; CHECK-NEXT:    [[TMP37:%.*]] = mul i64 0, [[TMP36]]
+; CHECK-NEXT:    [[TMP38:%.*]] = sub i64 1, [[TMP36]]
+; CHECK-NEXT:    [[TMP39:%.*]] = getelementptr inbounds i8, ptr [[TMP34]], i64 [[TMP37]]
+; CHECK-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[TMP39]], i64 [[TMP38]]
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP40]], align 1
+; CHECK-NEXT:    [[REVERSE3:%.*]] = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> [[WIDE_LOAD2]])
+; CHECK-NEXT:    [[TMP41:%.*]] = icmp eq <vscale x 16 x i8> [[REVERSE]], [[REVERSE3]]
+; CHECK-NEXT:    [[TMP42:%.*]] = xor <vscale x 16 x i1> [[TMP41]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP43:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP42]])
+; CHECK-NEXT:    br i1 [[TMP43]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC4]]
+; CHECK:       loop.inc4:
+; CHECK-NEXT:    [[INDEX_NEXT5]] = add nuw i64 [[INDEX1]], [[TMP6]]
+; CHECK-NEXT:    [[TMP44:%.*]] = icmp eq i64 [[INDEX_NEXT5]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP44]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
+; CHECK:       vector.early.exit:
+; CHECK-NEXT:    [[TMP45:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP42]], i1 true)
+; CHECK-NEXT:    [[TMP46:%.*]] = add i64 [[TMP45]], [[INDEX1]]
+; CHECK-NEXT:    [[IND_EARLY_ESCAPE:%.*]] = sub i64 1023, [[TMP46]]
+; CHECK-NEXT:    br label [[LOOP_END:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_END]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1023, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]]
+; CHECK:       loop.inc:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], -1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP_END]], label [[LOOP]], !llvm.loop [[LOOP37:![0-9]+]]
+; CHECK:       loop.end:
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 1024, [[LOOP_INC]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ], [ 1024, [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i64 [[RETVAL]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  call void @init_mem(ptr %p1, i64 1024)
+  call void @init_mem(ptr %p2, i64 1024)
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 1023, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %index.next = add i64 %index, -1
+  %exitcond = icmp eq i64 %index.next, 0
+  br i1 %exitcond, label %loop.end, label %loop
+
+loop.end:
+  %retval = phi i64 [ %index, %loop ], [ 1024, %loop.inc ]
+  ret i64 %retval
+}
+
+
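+; The loop contains a reduction (%red.next) that is live out of the early
+; exit; we don't vectorise this yet.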
+define i64 @same_exit_block_pre_inc_use1_with_reduction() {
+; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_with_reduction(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
+; CHECK-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT:    br label [[LAND_RHS:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[RED:%.*]] = phi i64 [ [[RED_NEXT:%.*]], [[FOR_INC]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP39:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[LD2_ZEXT:%.*]] = zext i8 [[TMP39]] to i64
+; CHECK-NEXT:    [[RED_NEXT]] = add i64 [[RED]], [[LD2_ZEXT]]
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[TMP38]], [[TMP39]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK:       loop.inc:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]]
+; CHECK:       loop.end:
+; CHECK-NEXT:    [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], [[FOR_INC]] ], [ [[RED_NEXT]], [[LAND_RHS]] ]
+; CHECK-NEXT:    [[FINAL_IND:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 67, [[FOR_INC]] ]
+; CHECK-NEXT:    [[START_0_LCSSA:%.*]] = add i64 [[RED_NEXT_LCSSA]], [[FINAL_IND]]
+; CHECK-NEXT:    ret i64 [[START_0_LCSSA]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  call void @init_mem(ptr %p1, i64 1024)
+  call void @init_mem(ptr %p2, i64 1024)
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %red = phi i64 [ %red.next, %loop.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %ld2.zext = zext i8 %ld2 to i64
+  %red.next = add i64 %red, %ld2.zext
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 67
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  %final.ind = phi i64 [ %index, %loop ], [ 67, %loop.inc ]
+  %retval = add i64 %red.next, %final.ind
+  ret i64 %retval
+}
+
+
+; TODO: We should really be able to vectorise this by detecting that the
+; loads are unconditionally dereferenceable, but that requires work in
+; ScalarEvolution's getRangeRef to teach it about the dereferenceable
+; attribute.
+define i64 @same_exit_block_pre_inc_use1_deref_ptrs(ptr dereferenceable(1024) %p1, ptr dereferenceable(1024) %p2) {
+; MAY_FAULT-LABEL: define i64 @same_exit_block_pre_inc_use1_deref_ptrs(
+; MAY_FAULT-SAME: ptr dereferenceable(1024) [[P1:%.*]], ptr dereferenceable(1024) [[P2:%.*]]) #[[ATTR0]] {
+; MAY_FAULT-NEXT:  entry:
+; MAY_FAULT-NEXT:    br label [[LOOP:%.*]]
+; MAY_FAULT:       loop:
+; MAY_FAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
+; MAY_FAULT-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; MAY_FAULT-NEXT:    [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; MAY_FAULT-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; MAY_FAULT-NEXT:    [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; MAY_FAULT-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; MAY_FAULT-NEXT:    br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]]
+; MAY_FAULT:       loop.inc:
+; MAY_FAULT-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; MAY_FAULT-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; MAY_FAULT-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
+; MAY_FAULT:       loop.end:
+; MAY_FAULT-NEXT:    [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ]
+; MAY_FAULT-NEXT:    ret i64 [[RETVAL]]
+;
+; NO_FAULT-LABEL: define i64 @same_exit_block_pre_inc_use1_deref_ptrs(
+; NO_FAULT-SAME: ptr dereferenceable(1024) [[P1:%.*]], ptr dereferenceable(1024) [[P2:%.*]]) #[[ATTR0]] {
+; NO_FAULT-NEXT:  entry:
+; NO_FAULT-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO_FAULT-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; NO_FAULT-NEXT:    [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 32, i64 [[TMP1]])
+; NO_FAULT-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 64, [[TMP2]]
+; NO_FAULT-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO_FAULT:       vector.ph:
+; NO_FAULT-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; NO_FAULT-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 16
+; NO_FAULT-NEXT:    [[N_MOD_VF:%.*]] = urem i64 64, [[TMP4]]
+; NO_FAULT-NEXT:    [[N_VEC:%.*]] = sub i64 64, [[N_MOD_VF]]
+; NO_FAULT-NEXT:    [[IND_END:%.*]] = add i64 3, [[N_VEC]]
+; NO_FAULT-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; NO_FAULT-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; NO_FAULT-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NO_FAULT:       vector.body:
+; NO_FAULT-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ]
+; NO_FAULT-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; NO_FAULT-NEXT:    [[TMP7:%.*]] = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
+; NO_FAULT-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[OFFSET_IDX]], i64 0
+; NO_FAULT-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[DOTSPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; NO_FAULT-NEXT:    [[TMP8:%.*]] = add <vscale x 16 x i64> zeroinitializer, [[TMP7]]
+; NO_FAULT-NEXT:    [[TMP9:%.*]] = mul <vscale x 16 x i64> [[TMP8]], shufflevector (<vscale x 16 x i64> insertelement (<vscale x 16 x i64> poison, i64 1, i64 0), <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer)
+; NO_FAULT-NEXT:    [[TMP10:%.*]] = add <vscale x 16 x i64> [[DOTSPLAT]], [[TMP9]]
+; NO_FAULT-NEXT:    [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 0
+; NO_FAULT-NEXT:    [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 1
+; NO_FAULT-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 2
+; NO_FAULT-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 3
+; NO_FAULT-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 4
+; NO_FAULT-NEXT:    [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 5
+; NO_FAULT-NEXT:    [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 6
+; NO_FAULT-NEXT:    [[TMP18:%.*]] = add i64 [[OFFSET_IDX]], 7
+; NO_FAULT-NEXT:    [[TMP19:%.*]] = add i64 [[OFFSET_IDX]], 8
+; NO_FAULT-NEXT:    [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 9
+; NO_FAULT-NEXT:    [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 10
+; NO_FAULT-NEXT:    [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 11
+; NO_FAULT-NEXT:    [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 12
+; NO_FAULT-NEXT:    [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], 13
+; NO_FAULT-NEXT:    [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 14
+; NO_FAULT-NEXT:    [[TMP26:%.*]] = add i64 [[OFFSET_IDX]], 15
+; NO_FAULT-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP11]]
+; NO_FAULT-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i32 0
+; NO_FAULT-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP28]], align 1
+; NO_FAULT-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP11]]
+; NO_FAULT-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP29]], i32 0
+; NO_FAULT-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP30]], align 1
+; NO_FAULT-NEXT:    [[TMP31:%.*]] = icmp eq <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; NO_FAULT-NEXT:    [[TMP32:%.*]] = xor <vscale x 16 x i1> [[TMP31]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; NO_FAULT-NEXT:    [[TMP33:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP32]])
+; NO_FAULT-NEXT:    br i1 [[TMP33]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]]
+; NO_FAULT:       loop.inc3:
+; NO_FAULT-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP6]]
+; NO_FAULT-NEXT:    [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
+; NO_FAULT-NEXT:    br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
+; NO_FAULT:       vector.early.exit:
+; NO_FAULT-NEXT:    [[TMP35:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
+; NO_FAULT-NEXT:    [[TMP36:%.*]] = add i64 [[TMP35]], [[INDEX1]]
+; NO_FAULT-NEXT:    [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP36]]
+; NO_FAULT-NEXT:    br label [[LOOP_END:%.*]]
+; NO_FAULT:       middle.block:
+; NO_FAULT-NEXT:    [[CMP_N:%.*]] = icmp eq i64 64, [[N_VEC]]
+; NO_FAULT-NEXT:    br i1 [[CMP_N]], label [[LOOP_END]], label [[SCALAR_PH]]
+; NO_FAULT:       scalar.ph:
+; NO_FAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; NO_FAULT-NEXT:    br label [[LOOP:%.*]]
+; NO_FAULT:       loop:
+; NO_FAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; NO_FAULT-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; NO_FAULT-NEXT:    [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO_FAULT-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; NO_FAULT-NEXT:    [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; NO_FAULT-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; NO_FAULT-NEXT:    br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]]
+; NO_FAULT:       loop.inc:
+; NO_FAULT-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; NO_FAULT-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; NO_FAULT-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP39:![0-9]+]]
+; NO_FAULT:       loop.end:
+; NO_FAULT-NEXT:    [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ], [ 67, [[MIDDLE_BLOCK]] ]
+; NO_FAULT-NEXT:    ret i64 [[RETVAL]]
+;
+entry:
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 67
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ]
+  ret i64 %retval
+}
+
+
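+; The allocas are only 42 bytes, but the loop reads offsets up to 66, so we
+; cannot prove the loads are unconditionally dereferenceable. Only the
+; NO_FAULT run (which assumes the loads cannot fault) vectorises.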
+define i64 @same_exit_block_pre_inc_use1_too_small_allocas() {
+; MAY_FAULT-LABEL: define i64 @same_exit_block_pre_inc_use1_too_small_allocas(
+; MAY_FAULT-SAME: ) #[[ATTR0]] {
+; MAY_FAULT-NEXT:  entry:
+; MAY_FAULT-NEXT:    [[P1:%.*]] = alloca [42 x i8], align 4
+; MAY_FAULT-NEXT:    [[P2:%.*]] = alloca [42 x i8], align 4
+; MAY_FAULT-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
+; MAY_FAULT-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
+; MAY_FAULT-NEXT:    br label [[LAND_RHS:%.*]]
+; MAY_FAULT:       loop:
+; MAY_FAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
+; MAY_FAULT-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; MAY_FAULT-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; MAY_FAULT-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; MAY_FAULT-NEXT:    [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; MAY_FAULT-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[TMP1]], [[TMP2]]
+; MAY_FAULT-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_END_LOOPEXIT:%.*]]
+; MAY_FAULT:       loop.inc:
+; MAY_FAULT-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; MAY_FAULT-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; MAY_FAULT-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]]
+; MAY_FAULT:       loop.end:
+; MAY_FAULT-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 67, [[FOR_INC]] ]
+; MAY_FAULT-NEXT:    ret i64 [[START_0_LCSSA]]
+;
+; NO_FAULT-LABEL: define i64 @same_exit_block_pre_inc_use1_too_small_allocas(
+; NO_FAULT-SAME: ) #[[ATTR0]] {
+; NO_FAULT-NEXT:  entry:
+; NO_FAULT-NEXT:    [[P1:%.*]] = alloca [42 x i8], align 4
+; NO_FAULT-NEXT:    [[P2:%.*]] = alloca [42 x i8], align 4
+; NO_FAULT-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
+; NO_FAULT-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
+; NO_FAULT-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; NO_FAULT-NEXT:    [[TMP39:%.*]] = mul i64 [[TMP1]], 16
+; NO_FAULT-NEXT:    [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 32, i64 [[TMP39]])
+; NO_FAULT-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 64, [[TMP2]]
+; NO_FAULT-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO_FAULT:       vector.ph:
+; NO_FAULT-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; NO_FAULT-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 16
+; NO_FAULT-NEXT:    [[N_MOD_VF:%.*]] = urem i64 64, [[TMP4]]
+; NO_FAULT-NEXT:    [[N_VEC:%.*]] = sub i64 64, [[N_MOD_VF]]
+; NO_FAULT-NEXT:    [[IND_END:%.*]] = add i64 3, [[N_VEC]]
+; NO_FAULT-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; NO_FAULT-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; NO_FAULT-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NO_FAULT:       vector.body:
+; NO_FAULT-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[FOR_INC3:%.*]] ]
+; NO_FAULT-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; NO_FAULT-NEXT:    [[TMP7:%.*]] = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
+; NO_FAULT-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[OFFSET_IDX]], i64 0
+; NO_FAULT-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[DOTSPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; NO_FAULT-NEXT:    [[TMP8:%.*]] = add <vscale x 16 x i64> zeroinitializer, [[TMP7]]
+; NO_FAULT-NEXT:    [[TMP9:%.*]] = mul <vscale x 16 x i64> [[TMP8]], shufflevector (<vscale x 16 x i64> insertelement (<vscale x 16 x i64> poison, i64 1, i64 0), <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer)
+; NO_FAULT-NEXT:    [[TMP10:%.*]] = add <vscale x 16 x i64> [[DOTSPLAT]], [[TMP9]]
+; NO_FAULT-NEXT:    [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 0
+; NO_FAULT-NEXT:    [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 1
+; NO_FAULT-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 2
+; NO_FAULT-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 3
+; NO_FAULT-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 4
+; NO_FAULT-NEXT:    [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 5
+; NO_FAULT-NEXT:    [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 6
+; NO_FAULT-NEXT:    [[TMP18:%.*]] = add i64 [[OFFSET_IDX]], 7
+; NO_FAULT-NEXT:    [[TMP19:%.*]] = add i64 [[OFFSET_IDX]], 8
+; NO_FAULT-NEXT:    [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 9
+; NO_FAULT-NEXT:    [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 10
+; NO_FAULT-NEXT:    [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 11
+; NO_FAULT-NEXT:    [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 12
+; NO_FAULT-NEXT:    [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], 13
+; NO_FAULT-NEXT:    [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 14
+; NO_FAULT-NEXT:    [[TMP26:%.*]] = add i64 [[OFFSET_IDX]], 15
+; NO_FAULT-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP11]]
+; NO_FAULT-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i32 0
+; NO_FAULT-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP28]], align 1
+; NO_FAULT-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP11]]
+; NO_FAULT-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP29]], i32 0
+; NO_FAULT-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP30]], align 1
+; NO_FAULT-NEXT:    [[TMP31:%.*]] = icmp eq <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; NO_FAULT-NEXT:    [[TMP32:%.*]] = xor <vscale x 16 x i1> [[TMP31]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; NO_FAULT-NEXT:    [[TMP33:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP32]])
+; NO_FAULT-NEXT:    br i1 [[TMP33]], label [[VECTOR_EARLY_EXIT:%.*]], label [[FOR_INC3]]
+; NO_FAULT:       loop.inc3:
+; NO_FAULT-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP6]]
+; NO_FAULT-NEXT:    [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
+; NO_FAULT-NEXT:    br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
+; NO_FAULT:       vector.early.exit:
+; NO_FAULT-NEXT:    [[TMP35:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
+; NO_FAULT-NEXT:    [[TMP36:%.*]] = add i64 [[TMP35]], [[INDEX1]]
+; NO_FAULT-NEXT:    [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP36]]
+; NO_FAULT-NEXT:    br label [[FOR_END_LOOPEXIT:%.*]]
+; NO_FAULT:       middle.block:
+; NO_FAULT-NEXT:    [[CMP_N:%.*]] = icmp eq i64 64, [[N_VEC]]
+; NO_FAULT-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT]], label [[SCALAR_PH]]
+; NO_FAULT:       scalar.ph:
+; NO_FAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; NO_FAULT-NEXT:    br label [[LAND_RHS:%.*]]
+; NO_FAULT:       loop:
+; NO_FAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; NO_FAULT-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; NO_FAULT-NEXT:    [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO_FAULT-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; NO_FAULT-NEXT:    [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; NO_FAULT-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
+; NO_FAULT-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_END_LOOPEXIT]]
+; NO_FAULT:       loop.inc:
+; NO_FAULT-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; NO_FAULT-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; NO_FAULT-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP41:![0-9]+]]
+; NO_FAULT:       loop.end:
+; NO_FAULT-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 67, [[FOR_INC]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ], [ 67, [[MIDDLE_BLOCK]] ]
+; NO_FAULT-NEXT:    ret i64 [[START_0_LCSSA]]
+;
+entry:
+  %p1 = alloca [42 x i8]
+  %p2 = alloca [42 x i8]
+  call void @init_mem(ptr %p1, i64 1024)
+  call void @init_mem(ptr %p2, i64 1024)
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 67
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ]
+  ret i64 %retval
+}
+
+
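+; The pointers are only dereferenceable(42), but the loop reads offsets up to
+; 66, so as above only the NO_FAULT run vectorises.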
+define i64 @same_exit_block_pre_inc_use1_too_small_deref_ptrs(ptr dereferenceable(42) %p1, ptr dereferenceable(42) %p2) {
+; MAY_FAULT-LABEL: define i64 @same_exit_block_pre_inc_use1_too_small_deref_ptrs(
+; MAY_FAULT-SAME: ptr dereferenceable(42) [[P1:%.*]], ptr dereferenceable(42) [[P2:%.*]]) #[[ATTR0]] {
+; MAY_FAULT-NEXT:  entry:
+; MAY_FAULT-NEXT:    br label [[LAND_RHS:%.*]]
+; MAY_FAULT:       loop:
+; MAY_FAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
+; MAY_FAULT-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; MAY_FAULT-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; MAY_FAULT-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; MAY_FAULT-NEXT:    [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; MAY_FAULT-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[TMP1]], [[TMP2]]
+; MAY_FAULT-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_END_LOOPEXIT:%.*]]
+; MAY_FAULT:       loop.inc:
+; MAY_FAULT-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; MAY_FAULT-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; MAY_FAULT-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]]
+; MAY_FAULT:       loop.end:
+; MAY_FAULT-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 67, [[FOR_INC]] ]
+; MAY_FAULT-NEXT:    ret i64 [[START_0_LCSSA]]
+;
+; NO_FAULT-LABEL: define i64 @same_exit_block_pre_inc_use1_too_small_deref_ptrs(
+; NO_FAULT-SAME: ptr dereferenceable(42) [[P1:%.*]], ptr dereferenceable(42) [[P2:%.*]]) #[[ATTR0]] {
+; NO_FAULT-NEXT:  entry:
+; NO_FAULT-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; NO_FAULT-NEXT:    [[TMP39:%.*]] = mul i64 [[TMP1]], 16
+; NO_FAULT-NEXT:    [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 32, i64 [[TMP39]])
+; NO_FAULT-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 64, [[TMP2]]
+; NO_FAULT-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO_FAULT:       vector.ph:
+; NO_FAULT-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; NO_FAULT-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 16
+; NO_FAULT-NEXT:    [[N_MOD_VF:%.*]] = urem i64 64, [[TMP4]]
+; NO_FAULT-NEXT:    [[N_VEC:%.*]] = sub i64 64, [[N_MOD_VF]]
+; NO_FAULT-NEXT:    [[IND_END:%.*]] = add i64 3, [[N_VEC]]
+; NO_FAULT-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; NO_FAULT-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; NO_FAULT-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NO_FAULT:       vector.body:
+; NO_FAULT-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[FOR_INC3:%.*]] ]
+; NO_FAULT-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; NO_FAULT-NEXT:    [[TMP7:%.*]] = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
+; NO_FAULT-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[OFFSET_IDX]], i64 0
+; NO_FAULT-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[DOTSPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; NO_FAULT-NEXT:    [[TMP8:%.*]] = add <vscale x 16 x i64> zeroinitializer, [[TMP7]]
+; NO_FAULT-NEXT:    [[TMP9:%.*]] = mul <vscale x 16 x i64> [[TMP8]], shufflevector (<vscale x 16 x i64> insertelement (<vscale x 16 x i64> poison, i64 1, i64 0), <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer)
+; NO_FAULT-NEXT:    [[TMP10:%.*]] = add <vscale x 16 x i64> [[DOTSPLAT]], [[TMP9]]
+; NO_FAULT-NEXT:    [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 0
+; NO_FAULT-NEXT:    [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 1
+; NO_FAULT-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 2
+; NO_FAULT-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 3
+; NO_FAULT-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 4
+; NO_FAULT-NEXT:    [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 5
+; NO_FAULT-NEXT:    [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 6
+; NO_FAULT-NEXT:    [[TMP18:%.*]] = add i64 [[OFFSET_IDX]], 7
+; NO_FAULT-NEXT:    [[TMP19:%.*]] = add i64 [[OFFSET_IDX]], 8
+; NO_FAULT-NEXT:    [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 9
+; NO_FAULT-NEXT:    [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 10
+; NO_FAULT-NEXT:    [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 11
+; NO_FAULT-NEXT:    [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 12
+; NO_FAULT-NEXT:    [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], 13
+; NO_FAULT-NEXT:    [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 14
+; NO_FAULT-NEXT:    [[TMP26:%.*]] = add i64 [[OFFSET_IDX]], 15
+; NO_FAULT-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP11]]
+; NO_FAULT-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i32 0
+; NO_FAULT-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP28]], align 1
+; NO_FAULT-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP11]]
+; NO_FAULT-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP29]], i32 0
+; NO_FAULT-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP30]], align 1
+; NO_FAULT-NEXT:    [[TMP31:%.*]] = icmp eq <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; NO_FAULT-NEXT:    [[TMP32:%.*]] = xor <vscale x 16 x i1> [[TMP31]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; NO_FAULT-NEXT:    [[TMP33:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP32]])
+; NO_FAULT-NEXT:    br i1 [[TMP33]], label [[VECTOR_EARLY_EXIT:%.*]], label [[FOR_INC3]]
+; NO_FAULT:       loop.inc3:
+; NO_FAULT-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP6]]
+; NO_FAULT-NEXT:    [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
+; NO_FAULT-NEXT:    br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
+; NO_FAULT:       vector.early.exit:
+; NO_FAULT-NEXT:    [[TMP35:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
+; NO_FAULT-NEXT:    [[TMP36:%.*]] = add i64 [[TMP35]], [[INDEX1]]
+; NO_FAULT-NEXT:    [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP36]]
+; NO_FAULT-NEXT:    br label [[FOR_END_LOOPEXIT:%.*]]
+; NO_FAULT:       middle.block:
+; NO_FAULT-NEXT:    [[CMP_N:%.*]] = icmp eq i64 64, [[N_VEC]]
+; NO_FAULT-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT]], label [[SCALAR_PH]]
+; NO_FAULT:       scalar.ph:
+; NO_FAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; NO_FAULT-NEXT:    br label [[LAND_RHS:%.*]]
+; NO_FAULT:       loop:
+; NO_FAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; NO_FAULT-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; NO_FAULT-NEXT:    [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO_FAULT-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; NO_FAULT-NEXT:    [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; NO_FAULT-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
+; NO_FAULT-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_END_LOOPEXIT]]
+; NO_FAULT:       loop.inc:
+; NO_FAULT-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; NO_FAULT-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; NO_FAULT-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP43:![0-9]+]]
+; NO_FAULT:       loop.end:
+; NO_FAULT-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 67, [[FOR_INC]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ], [ 67, [[MIDDLE_BLOCK]] ]
+; NO_FAULT-NEXT:    ret i64 [[START_0_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 67
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ]
+  ret i64 %retval
+}
+
+
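+; Nothing is known about the dereferenceability of %p1 and %p2, so only the
+; NO_FAULT run (which assumes the loads cannot fault) vectorises.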
+define i64 @same_exit_block_pre_inc_use1_unknown_ptrs(ptr %p1, ptr %p2) {
+; MAY_FAULT-LABEL: define i64 @same_exit_block_pre_inc_use1_unknown_ptrs(
+; MAY_FAULT-SAME: ptr [[P1:%.*]], ptr [[P2:%.*]]) #[[ATTR0]] {
+; MAY_FAULT-NEXT:  entry:
+; MAY_FAULT-NEXT:    br label [[LAND_RHS:%.*]]
+; MAY_FAULT:       loop:
+; MAY_FAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
+; MAY_FAULT-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; MAY_FAULT-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; MAY_FAULT-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; MAY_FAULT-NEXT:    [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; MAY_FAULT-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[TMP1]], [[TMP2]]
+; MAY_FAULT-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_END_LOOPEXIT:%.*]]
+; MAY_FAULT:       loop.inc:
+; MAY_FAULT-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; MAY_FAULT-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; MAY_FAULT-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]]
+; MAY_FAULT:       loop.end:
+; MAY_FAULT-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 67, [[FOR_INC]] ]
+; MAY_FAULT-NEXT:    ret i64 [[START_0_LCSSA]]
+;
+; NO_FAULT-LABEL: define i64 @same_exit_block_pre_inc_use1_unknown_ptrs(
+; NO_FAULT-SAME: ptr [[P1:%.*]], ptr [[P2:%.*]]) #[[ATTR0]] {
+; NO_FAULT-NEXT:  entry:
+; NO_FAULT-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; NO_FAULT-NEXT:    [[TMP39:%.*]] = mul i64 [[TMP1]], 16
+; NO_FAULT-NEXT:    [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 32, i64 [[TMP39]])
+; NO_FAULT-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 64, [[TMP2]]
+; NO_FAULT-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO_FAULT:       vector.ph:
+; NO_FAULT-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; NO_FAULT-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 16
+; NO_FAULT-NEXT:    [[N_MOD_VF:%.*]] = urem i64 64, [[TMP4]]
+; NO_FAULT-NEXT:    [[N_VEC:%.*]] = sub i64 64, [[N_MOD_VF]]
+; NO_FAULT-NEXT:    [[IND_END:%.*]] = add i64 3, [[N_VEC]]
+; NO_FAULT-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; NO_FAULT-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; NO_FAULT-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NO_FAULT:       vector.body:
+; NO_FAULT-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[FOR_INC3:%.*]] ]
+; NO_FAULT-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; NO_FAULT-NEXT:    [[TMP7:%.*]] = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
+; NO_FAULT-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[OFFSET_IDX]], i64 0
+; NO_FAULT-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[DOTSPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; NO_FAULT-NEXT:    [[TMP8:%.*]] = add <vscale x 16 x i64> zeroinitializer, [[TMP7]]
+; NO_FAULT-NEXT:    [[TMP9:%.*]] = mul <vscale x 16 x i64> [[TMP8]], shufflevector (<vscale x 16 x i64> insertelement (<vscale x 16 x i64> poison, i64 1, i64 0), <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer)
+; NO_FAULT-NEXT:    [[TMP10:%.*]] = add <vscale x 16 x i64> [[DOTSPLAT]], [[TMP9]]
+; NO_FAULT-NEXT:    [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 0
+; NO_FAULT-NEXT:    [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 1
+; NO_FAULT-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 2
+; NO_FAULT-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 3
+; NO_FAULT-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 4
+; NO_FAULT-NEXT:    [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 5
+; NO_FAULT-NEXT:    [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 6
+; NO_FAULT-NEXT:    [[TMP18:%.*]] = add i64 [[OFFSET_IDX]], 7
+; NO_FAULT-NEXT:    [[TMP19:%.*]] = add i64 [[OFFSET_IDX]], 8
+; NO_FAULT-NEXT:    [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 9
+; NO_FAULT-NEXT:    [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 10
+; NO_FAULT-NEXT:    [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 11
+; NO_FAULT-NEXT:    [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 12
+; NO_FAULT-NEXT:    [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], 13
+; NO_FAULT-NEXT:    [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 14
+; NO_FAULT-NEXT:    [[TMP26:%.*]] = add i64 [[OFFSET_IDX]], 15
+; NO_FAULT-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP11]]
+; NO_FAULT-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i32 0
+; NO_FAULT-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP28]], align 1
+; NO_FAULT-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP11]]
+; NO_FAULT-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP29]], i32 0
+; NO_FAULT-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP30]], align 1
+; NO_FAULT-NEXT:    [[TMP31:%.*]] = icmp eq <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; NO_FAULT-NEXT:    [[TMP32:%.*]] = xor <vscale x 16 x i1> [[TMP31]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; NO_FAULT-NEXT:    [[TMP33:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP32]])
+; NO_FAULT-NEXT:    br i1 [[TMP33]], label [[VECTOR_EARLY_EXIT:%.*]], label [[FOR_INC3]]
+; NO_FAULT:       loop.inc3:
+; NO_FAULT-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP6]]
+; NO_FAULT-NEXT:    [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
+; NO_FAULT-NEXT:    br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]]
+; NO_FAULT:       vector.early.exit:
+; NO_FAULT-NEXT:    [[TMP35:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
+; NO_FAULT-NEXT:    [[TMP36:%.*]] = add i64 [[TMP35]], [[INDEX1]]
+; NO_FAULT-NEXT:    [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP36]]
+; NO_FAULT-NEXT:    br label [[FOR_END_LOOPEXIT:%.*]]
+; NO_FAULT:       middle.block:
+; NO_FAULT-NEXT:    [[CMP_N:%.*]] = icmp eq i64 64, [[N_VEC]]
+; NO_FAULT-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT]], label [[SCALAR_PH]]
+; NO_FAULT:       scalar.ph:
+; NO_FAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; NO_FAULT-NEXT:    br label [[LAND_RHS:%.*]]
+; NO_FAULT:       loop:
+; NO_FAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; NO_FAULT-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; NO_FAULT-NEXT:    [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO_FAULT-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; NO_FAULT-NEXT:    [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; NO_FAULT-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
+; NO_FAULT-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_END_LOOPEXIT]]
+; NO_FAULT:       loop.inc:
+; NO_FAULT-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; NO_FAULT-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; NO_FAULT-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP45:![0-9]+]]
+; NO_FAULT:       loop.end:
+; NO_FAULT-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 67, [[FOR_INC]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ], [ 67, [[MIDDLE_BLOCK]] ]
+; NO_FAULT-NEXT:    ret i64 [[START_0_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 67
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ]
+  ret i64 %retval
+}
+;.
+; MAY_FAULT: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; MAY_FAULT: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; MAY_FAULT: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; MAY_FAULT: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP17]] = distinct !{[[LOOP17]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP18]] = distinct !{[[LOOP18]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP19]] = distinct !{[[LOOP19]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP20]] = distinct !{[[LOOP20]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP21]] = distinct !{[[LOOP21]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP22]] = distinct !{[[LOOP22]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP23]] = distinct !{[[LOOP23]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP24]] = distinct !{[[LOOP24]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP25]] = distinct !{[[LOOP25]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP26]] = distinct !{[[LOOP26]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP27]] = distinct !{[[LOOP27]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP28]] = distinct !{[[LOOP28]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP29]] = distinct !{[[LOOP29]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP30]] = distinct !{[[LOOP30]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP31]] = distinct !{[[LOOP31]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP32]] = distinct !{[[LOOP32]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP33]] = distinct !{[[LOOP33]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP34]] = distinct !{[[LOOP34]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP35]] = distinct !{[[LOOP35]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP36]] = distinct !{[[LOOP36]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP37]] = distinct !{[[LOOP37]], [[META2]], [[META1]]}
+;.
+; NO_FAULT: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; NO_FAULT: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; NO_FAULT: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; NO_FAULT: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP17]] = distinct !{[[LOOP17]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP18]] = distinct !{[[LOOP18]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP19]] = distinct !{[[LOOP19]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP20]] = distinct !{[[LOOP20]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP21]] = distinct !{[[LOOP21]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP22]] = distinct !{[[LOOP22]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP23]] = distinct !{[[LOOP23]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP24]] = distinct !{[[LOOP24]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP25]] = distinct !{[[LOOP25]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP26]] = distinct !{[[LOOP26]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP27]] = distinct !{[[LOOP27]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP28]] = distinct !{[[LOOP28]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP29]] = distinct !{[[LOOP29]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP30]] = distinct !{[[LOOP30]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP31]] = distinct !{[[LOOP31]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP32]] = distinct !{[[LOOP32]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP33]] = distinct !{[[LOOP33]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP34]] = distinct !{[[LOOP34]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP35]] = distinct !{[[LOOP35]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP36]] = distinct !{[[LOOP36]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP37]] = distinct !{[[LOOP37]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP38]] = distinct !{[[LOOP38]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP39]] = distinct !{[[LOOP39]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP40]] = distinct !{[[LOOP40]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP41]] = distinct !{[[LOOP41]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP42]] = distinct !{[[LOOP42]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP43]] = distinct !{[[LOOP43]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP44]] = distinct !{[[LOOP44]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP45]] = distinct !{[[LOOP45]], [[META2]], [[META1]]}
+;.
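
For anyone skimming the generated checks above: the NO_FAULT output follows
the early-exit vectorization pattern this patch enables. A minimal sketch of
that pattern, with illustrative value names rather than the exact generated
ones (the vectorizer's actual output uses an xor of an icmp eq, which is
equivalent to the icmp ne below):

  ; One vector iteration: compare a whole group of lanes at once.
  %mask = icmp ne <vscale x 16 x i8> %wide.load, %wide.load2
  ; Did any lane take the early exit?
  %any = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> %mask)
  br i1 %any, label %vector.early.exit, label %loop.inc

vector.early.exit:
  ; Position of the first exiting lane within this vector iteration.
  %lane = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> %mask, i1 true)
  ; Recover the scalar induction value of the exiting iteration; the IV start
  ; value (3 in the test above) is added afterwards, as in IND_EARLY_ESCAPE.
  %escape = add i64 %index, %lane
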
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll
index ac33f6e3e6f72..f903ea935e5cc 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll
@@ -44,7 +44,17 @@
 ; CHECK: remark: source.cpp:29:7: loop not vectorized: control flow cannot be substituted for a select
 ; CHECK: remark: source.cpp:27:3: loop not vectorized
 
-; YAML:       --- !Analysis
+
+; YAML: --- !Analysis
+; YAML-NEXT: Pass:            loop-vectorize
+; YAML-NEXT: Name:            EarlyExitNotLatchPredecessor
+; YAML-NEXT: DebugLoc:        { File: source.cpp, Line: 5, Column: 9 }
+; YAML-NEXT: Function:        _Z4testPii
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          'loop not vectorized: '
+; YAML-NEXT:   - String:          Cannot vectorize early exit loop
+; YAML-NEXT: ...
+; YAML-NEXT: --- !Analysis
 ; YAML-NEXT: Pass:            loop-vectorize
 ; YAML-NEXT: Name:            CantComputeNumberOfIterations
 ; YAML-NEXT: DebugLoc:        { File: source.cpp, Line: 5, Column: 9 }
@@ -117,6 +127,15 @@
 ; YAML-NEXT: ...
 ; YAML-NEXT: --- !Analysis
 ; YAML-NEXT: Pass:            loop-vectorize
+; YAML-NEXT: Name:            EarlyExitNotLatchPredecessor
+; YAML-NEXT: DebugLoc:        { File: source.cpp, Line: 27, Column: 3 }
+; YAML-NEXT: Function:        test_multiple_failures
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          'loop not vectorized: '
+; YAML-NEXT:   - String:          Cannot vectorize early exit loop
+; YAML-NEXT: ...
+; YAML-NEXT: --- !Analysis
+; YAML-NEXT: Pass:            loop-vectorize
 ; YAML-NEXT: Name:            CantComputeNumberOfIterations
 ; YAML-NEXT: DebugLoc:        { File: source.cpp, Line: 27, Column: 3 }
 ; YAML-NEXT: Function:        test_multiple_failures
diff --git a/llvm/test/Transforms/LoopVectorize/control-flow.ll b/llvm/test/Transforms/LoopVectorize/control-flow.ll
index a27f2f0841bca..efcc24b3894a2 100644
--- a/llvm/test/Transforms/LoopVectorize/control-flow.ll
+++ b/llvm/test/Transforms/LoopVectorize/control-flow.ll
@@ -10,7 +10,7 @@
 ;   return 0;
 ; }
 
-; CHECK: remark: source.cpp:5:9: loop not vectorized: could not determine number of loop iterations
+; CHECK: remark: source.cpp:5:9: loop not vectorized: Cannot vectorize early exit loop
 ; CHECK: remark: source.cpp:5:9: loop not vectorized
 
 ; CHECK: _Z4testPii


