[llvm] [LoopVectorize] Add support for reverse loops in isDereferenceableAndAlignedInLoop (PR #96752)

David Sherwood via llvm-commits llvm-commits at lists.llvm.org
Tue Aug 20 06:30:08 PDT 2024


https://github.com/david-arm updated https://github.com/llvm/llvm-project/pull/96752

>From 22c593d2e2e38bb10898d1975fc5444ff91c1dd7 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Tue, 23 Jul 2024 07:34:07 +0000
Subject: [PATCH 1/2] [LoopVectorize] Add tests for dereferenceable loads in
 more loops

* Adds tests for strided accesses.
* Adds tests for reverse loops.

As part of this I've moved one of the negative tests from
load-deref-pred-align.ll into a new file
(load-deref-pred-neg-off.ll), because that test relies on a
pointer type with a size of 16 bits and I realised it's
probably not sensible to combine that with allocas bigger than
a 16-bit pointer can address!

I've also tweaked
scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll
to use an alloca for the array access rather than a global, since
this more reliably leads to the generation of dereferenceable loads.
---
 ...demanding-all-lanes-and-first-lane-only.ll |  38 +-
 .../LoopVectorize/load-deref-pred-align.ll    | 428 +++++++++++++++++-
 .../LoopVectorize/load-deref-pred-neg-off.ll  | 104 +++++
 3 files changed, 552 insertions(+), 18 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/load-deref-pred-neg-off.ll

diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll
index fcf1ba072a62cb..00081a42151f6a 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll
@@ -3,7 +3,7 @@
 
 target datalayout = "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64"
 
-@src = external global [8 x i32], align 4
+declare void @init()
 
 ; Test case where scalar steps are used by both a VPReplicateRecipe (demands
 ; all scalar lanes) and a VPInstruction that only demands the first lane.
@@ -12,34 +12,36 @@ define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst(p
 ; CHECK-LABEL: define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst(
 ; CHECK-SAME: ptr noalias [[DST:%.*]], ptr noalias [[SRC_1:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[SRC:%.*]] = alloca [8 x i32], align 4
+; CHECK-NEXT:    call void @init(ptr [[SRC]])
 ; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE6:.*]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[IV:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 3
-; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw i64 [[TMP0]], 4
+; CHECK-NEXT:    [[MUL_IV:%.*]] = mul nsw i64 [[IV]], 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul nsw i64 [[TMP1]], 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul nsw i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul nsw i64 [[TMP3]], 4
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP4]]
+; CHECK-NEXT:    [[GEP_SRC_1:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[MUL_IV]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP5]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[L_1:%.*]] = load i8, ptr [[GEP_SRC_1]], align 1
 ; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr [[TMP9]], align 1
 ; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr [[TMP10]], align 1
 ; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[TMP11]], align 1
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i8> poison, i8 [[TMP12]], i32 0
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i8> poison, i8 [[L_1]], i32 0
 ; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x i8> [[TMP16]], i8 [[TMP13]], i32 1
 ; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <4 x i8> [[TMP17]], i8 [[TMP14]], i32 2
 ; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> [[TMP18]], i8 [[TMP15]], i32 3
 ; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer
-; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[TMP0]], 4
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr [8 x i32], ptr @src, i64 0, i64 [[TMP21]]
+; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[IV]], 4
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr [8 x i32], ptr [[SRC]], i64 0, i64 [[TMP21]]
 ; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i32, ptr [[TMP22]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP23]], align 4
 ; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i1> [[TMP20]], i32 0
@@ -78,26 +80,28 @@ define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst(p
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
 ; CHECK:       [[LOOP_HEADER]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT:    [[MUL_IV:%.*]] = mul nsw i64 [[IV]], 4
-; CHECK-NEXT:    [[GEP_SRC_1:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[MUL_IV]]
-; CHECK-NEXT:    [[L_1:%.*]] = load i8, ptr [[GEP_SRC_1]], align 1
-; CHECK-NEXT:    [[C:%.*]] = icmp eq i8 [[L_1]], 0
+; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT:    [[MUL_IV1:%.*]] = mul nsw i64 [[IV1]], 4
+; CHECK-NEXT:    [[GEP_SRC_2:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[MUL_IV1]]
+; CHECK-NEXT:    [[L_3:%.*]] = load i8, ptr [[GEP_SRC_2]], align 1
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i8 [[L_3]], 0
 ; CHECK-NEXT:    br i1 [[C]], label %[[THEN:.*]], label %[[LOOP_LATCH]]
 ; CHECK:       [[THEN]]:
-; CHECK-NEXT:    [[IV_OR:%.*]] = or disjoint i64 [[IV]], 4
-; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds [8 x i32], ptr @src, i64 0, i64 [[IV_OR]]
+; CHECK-NEXT:    [[IV_OR:%.*]] = or disjoint i64 [[IV1]], 4
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds [8 x i32], ptr [[SRC]], i64 0, i64 [[IV_OR]]
 ; CHECK-NEXT:    [[L_2:%.*]] = load i32, ptr [[GEP_SRC]], align 4
 ; CHECK-NEXT:    store i32 [[L_2]], ptr [[DST]], align 4
 ; CHECK-NEXT:    br label %[[LOOP_LATCH]]
 ; CHECK:       [[LOOP_LATCH]]:
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 4
 ; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
 entry:
+  %src = alloca [8 x i32], align 4
+  call void @init(ptr %src)
   br label %loop.header
 
 loop.header:
@@ -110,7 +114,7 @@ loop.header:
 
 then:
   %iv.or = or disjoint i64 %iv, 4
-  %gep.src = getelementptr inbounds [8 x i32], ptr @src, i64 0, i64 %iv.or
+  %gep.src = getelementptr inbounds [8 x i32], ptr %src, i64 0, i64 %iv.or
   %l.2 = load i32, ptr %gep.src, align 4
   store i32 %l.2, ptr %dst, align 4
   br label %loop.latch
diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll
index a7c9a18127ade5..344181bfa0d1e0 100644
--- a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll
+++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S %s | FileCheck %s
 
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1-p:16:16:16:16"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
 
 declare void @init(ptr nocapture nofree)
 
@@ -296,3 +296,429 @@ latch:
 loop_exit:
   ret i8 %accum.next
 }
+
+; Test reverse loops where we can prove loads in predicated blocks are safe
+; to load unconditionally.
+define void @test_rev_loops_deref_loads(ptr nocapture noundef writeonly %dest) {
+; CHECK-LABEL: @test_rev_loops_deref_loads(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LOCAL_DEST:%.*]] = alloca [1024 x i32], align 4
+; CHECK-NEXT:    [[LOCAL_SRC:%.*]] = alloca [1024 x i32], align 4
+; CHECK-NEXT:    [[LOCAL_CMP:%.*]] = alloca [1024 x i32], align 4
+; CHECK-NEXT:    call void @init(ptr [[LOCAL_SRC]])
+; CHECK-NEXT:    call void @init(ptr [[LOCAL_CMP]])
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_CMP]], i64 0, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 -1
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4
+; CHECK-NEXT:    [[REVERSE:%.*]] = shufflevector <2 x i32> [[WIDE_LOAD]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <2 x i32> [[REVERSE]], <i32 3, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = xor <2 x i1> [[TMP4]], <i1 true, i1 true>
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
+; CHECK-NEXT:    br i1 [[TMP6]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK:       pred.store.if:
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP10:%.*]] = shl nsw i32 [[TMP8]], 2
+; CHECK-NEXT:    store i32 [[TMP10]], ptr [[TMP9]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; CHECK:       pred.store.continue:
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
+; CHECK-NEXT:    br i1 [[TMP11]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
+; CHECK:       pred.store.if1:
+; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], -1
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = load i32, ptr [[TMP13]], align 4
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP16:%.*]] = shl nsw i32 [[TMP14]], 2
+; CHECK-NEXT:    store i32 [[TMP16]], ptr [[TMP15]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE2]]
+; CHECK:       pred.store.continue2:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, [[MIDDLE_BLOCK]] ], [ 1023, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_CMP]], i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[CMP3_NOT:%.*]] = icmp eq i32 [[TMP18]], 3
+; CHECK-NEXT:    br i1 [[CMP3_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i32 [[TMP19]], 2
+; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i32 [[MUL]], ptr [[ARRAYIDX7]], align 4
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; CHECK-NEXT:    [[CMP2_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], 0
+; CHECK-NEXT:    br i1 [[CMP2_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(1024) [[DEST:%.*]], ptr noundef nonnull align 4 dereferenceable(1024) [[LOCAL_DEST]], i64 1024, i1 false)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %local_dest = alloca [1024 x i32], align 4
+  %local_src = alloca [1024 x i32], align 4
+  %local_cmp = alloca [1024 x i32], align 4
+  call void @init(ptr %local_src)
+  call void @init(ptr %local_cmp)
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.inc ]
+  %arrayidx = getelementptr inbounds [1024 x i32], ptr %local_cmp, i64 0, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %cmp3.not = icmp eq i32 %0, 3
+  br i1 %cmp3.not, label %for.inc, label %if.then
+
+if.then:
+  %arrayidx5 = getelementptr inbounds [1024 x i32], ptr %local_src, i64 0, i64 %indvars.iv
+  %1 = load i32, ptr %arrayidx5, align 4
+  %mul = shl nsw i32 %1, 2
+  %arrayidx7 = getelementptr inbounds [1024 x i32], ptr %local_dest, i64 0, i64 %indvars.iv
+  store i32 %mul, ptr %arrayidx7, align 4
+  br label %for.inc
+
+for.inc:
+  %indvars.iv.next = add nsw i64 %indvars.iv, -1
+  %cmp2.not = icmp eq i64 %indvars.iv, 0
+  br i1 %cmp2.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(1024) %dest, ptr noundef nonnull align 4 dereferenceable(1024) %local_dest, i64 1024, i1 false)
+  ret void
+}
+
+
+; Test reverse loops where we *cannot* prove loads in predicated blocks are safe
+; to load unconditionally.
+define void @test_rev_loops_non_deref_loads(ptr nocapture noundef writeonly %dest) {
+; CHECK-LABEL: @test_rev_loops_non_deref_loads(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LOCAL_DEST:%.*]] = alloca [1024 x i32], align 4
+; CHECK-NEXT:    [[LOCAL_SRC:%.*]] = alloca [1024 x i32], align 4
+; CHECK-NEXT:    [[LOCAL_CMP:%.*]] = alloca [1024 x i32], align 4
+; CHECK-NEXT:    call void @init(ptr [[LOCAL_SRC]])
+; CHECK-NEXT:    call void @init(ptr [[LOCAL_CMP]])
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 1023, i64 1022>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE2]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add <2 x i64> [[VEC_IND]], <i64 -1, i64 -1>
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_CMP]], i64 0, i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 -1
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4
+; CHECK-NEXT:    [[REVERSE:%.*]] = shufflevector <2 x i32> [[WIDE_LOAD]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq <2 x i32> [[REVERSE]], <i32 3, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <2 x i1> [[TMP5]], <i1 true, i1 true>
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0
+; CHECK-NEXT:    br i1 [[TMP7]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK:       pred.store.if:
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = shl nsw i32 [[TMP10]], 2
+; CHECK-NEXT:    store i32 [[TMP13]], ptr [[TMP12]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; CHECK:       pred.store.continue:
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1
+; CHECK-NEXT:    br i1 [[TMP14]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
+; CHECK:       pred.store.if1:
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP15]]
+; CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4
+; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP18]]
+; CHECK-NEXT:    [[TMP20:%.*]] = shl nsw i32 [[TMP17]], 2
+; CHECK-NEXT:    store i32 [[TMP20]], ptr [[TMP19]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE2]]
+; CHECK:       pred.store.continue2:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 -2, i64 -2>
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, [[MIDDLE_BLOCK]] ], [ 1023, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT:    [[OFF:%.*]] = add i64 [[INDVARS_IV]], -1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_CMP]], i64 0, i64 [[OFF]]
+; CHECK-NEXT:    [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[CMP3_NOT:%.*]] = icmp eq i32 [[TMP22]], 3
+; CHECK-NEXT:    br i1 [[CMP3_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[OFF]]
+; CHECK-NEXT:    [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i32 [[TMP23]], 2
+; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[OFF]]
+; CHECK-NEXT:    store i32 [[MUL]], ptr [[ARRAYIDX7]], align 4
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; CHECK-NEXT:    [[CMP2_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], 0
+; CHECK-NEXT:    br i1 [[CMP2_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(1024) [[DEST:%.*]], ptr noundef nonnull align 4 dereferenceable(1024) [[LOCAL_DEST]], i64 1024, i1 false)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %local_dest = alloca [1024 x i32], align 4
+  %local_src = alloca [1024 x i32], align 4
+  %local_cmp = alloca [1024 x i32], align 4
+  call void @init(ptr %local_src)
+  call void @init(ptr %local_cmp)
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.inc ]
+  %off = add i64 %indvars.iv, -1
+  %arrayidx = getelementptr inbounds [1024 x i32], ptr %local_cmp, i64 0, i64 %off
+  %0 = load i32, ptr %arrayidx, align 4
+  %cmp3.not = icmp eq i32 %0, 3
+  br i1 %cmp3.not, label %for.inc, label %if.then
+
+if.then:
+  %arrayidx5 = getelementptr inbounds [1024 x i32], ptr %local_src, i64 0, i64 %off
+  %1 = load i32, ptr %arrayidx5, align 4
+  %mul = shl nsw i32 %1, 2
+  %arrayidx7 = getelementptr inbounds [1024 x i32], ptr %local_dest, i64 0, i64 %off
+  store i32 %mul, ptr %arrayidx7, align 4
+  br label %for.inc
+
+for.inc:
+  %indvars.iv.next = add nsw i64 %indvars.iv, -1
+  %cmp2.not = icmp eq i64 %indvars.iv, 0
+  br i1 %cmp2.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(1024) %dest, ptr noundef nonnull align 4 dereferenceable(1024) %local_dest, i64 1024, i1 false)
+  ret void
+}
+
+
+; Test a loop with a positive step recurrence that has a strided access
+define i16 @test_strided_access(i64 %len, ptr %test_base) {
+; CHECK-LABEL: @test_strided_access(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [163840 x i16], align 4
+; CHECK-NEXT:    call void @init(ptr [[ALLOCA]])
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp sge <2 x i8> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = mul <2 x i64> [[VEC_IND]], <i64 2, i64 2>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = load i16, ptr [[TMP6]], align 2
+; CHECK-NEXT:    [[TMP10:%.*]] = load i16, ptr [[TMP8]], align 2
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x i16> poison, i16 [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <2 x i16> [[TMP11]], i16 [[TMP10]], i32 1
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i16> [[TMP12]], <2 x i16> zeroinitializer
+; CHECK-NEXT:    [[TMP13]] = add <2 x i16> [[VEC_PHI]], [[PREDPHI]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP15:%.*]] = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> [[TMP13]])
+; CHECK-NEXT:    br i1 true, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i16 [ [[TMP15]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
+; CHECK-NEXT:    [[ACCUM:%.*]] = phi i16 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE]], i64 [[IV]]
+; CHECK-NEXT:    [[L_T:%.*]] = load i8, ptr [[TEST_ADDR]], align 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sge i8 [[L_T]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[PRED:%.*]], label [[LATCH]]
+; CHECK:       pred:
+; CHECK-NEXT:    [[IV_STRIDE:%.*]] = mul i64 [[IV]], 2
+; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[IV_STRIDE]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i16, ptr [[ADDR]], align 2
+; CHECK-NEXT:    br label [[LATCH]]
+; CHECK:       latch:
+; CHECK-NEXT:    [[VAL_PHI:%.*]] = phi i16 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ]
+; CHECK-NEXT:    [[ACCUM_NEXT]] = add i16 [[ACCUM]], [[VAL_PHI]]
+; CHECK-NEXT:    [[EXIT:%.*]] = icmp eq i64 [[IV]], 4095
+; CHECK-NEXT:    br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK:       loop_exit:
+; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i16 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i16 [[ACCUM_NEXT_LCSSA]]
+;
+entry:
+  %alloca = alloca [163840 x i16], align 4
+  call void @init(ptr %alloca)
+  br label %loop
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
+  %accum = phi i16 [ 0, %entry ], [ %accum.next, %latch ]
+  %iv.next = add i64 %iv, 1
+  %test_addr = getelementptr inbounds i8, ptr %test_base, i64 %iv
+  %l.t = load i8, ptr %test_addr
+  %cmp = icmp sge i8 %l.t, 0
+  br i1 %cmp, label %pred, label %latch
+pred:
+  %iv.stride = mul i64 %iv, 2
+  %addr = getelementptr inbounds i16, ptr %alloca, i64 %iv.stride
+  %val = load i16, ptr %addr, align 2
+  br label %latch
+latch:
+  %val.phi = phi i16 [0, %loop], [%val, %pred]
+  %accum.next = add i16 %accum, %val.phi
+  %exit = icmp eq i64 %iv, 4095
+  br i1 %exit, label %loop_exit, label %loop
+
+loop_exit:
+  ret i16 %accum.next
+}
+
+
+; Test a loop with a negative step recurrence that has a strided access
+define void @test_rev_loops_strided_deref_loads(ptr nocapture noundef writeonly %dest) {
+; CHECK-LABEL: @test_rev_loops_strided_deref_loads(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LOCAL_DEST:%.*]] = alloca [1024 x i32], align 4
+; CHECK-NEXT:    [[LOCAL_SRC:%.*]] = alloca [1024 x i32], align 4
+; CHECK-NEXT:    [[LOCAL_CMP:%.*]] = alloca [1024 x i32], align 4
+; CHECK-NEXT:    call void @init(ptr [[LOCAL_SRC]])
+; CHECK-NEXT:    call void @init(ptr [[LOCAL_CMP]])
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 511, i64 510>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE2]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 511, [[INDEX]]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_CMP]], i64 0, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 -1
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4
+; CHECK-NEXT:    [[REVERSE:%.*]] = shufflevector <2 x i32> [[WIDE_LOAD]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <2 x i32> [[REVERSE]], <i32 3, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = xor <2 x i1> [[TMP4]], <i1 true, i1 true>
+; CHECK-NEXT:    [[TMP6:%.*]] = mul <2 x i64> [[VEC_IND]], <i64 2, i64 2>
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
+; CHECK-NEXT:    br i1 [[TMP7]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK:       pred.store.if:
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP12:%.*]] = shl nsw i32 [[TMP10]], 2
+; CHECK-NEXT:    store i32 [[TMP12]], ptr [[TMP11]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; CHECK:       pred.store.continue:
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
+; CHECK-NEXT:    br i1 [[TMP13]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
+; CHECK:       pred.store.if1:
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], -1
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP15]]
+; CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP19:%.*]] = shl nsw i32 [[TMP17]], 2
+; CHECK-NEXT:    store i32 [[TMP19]], ptr [[TMP18]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE2]]
+; CHECK:       pred.store.continue2:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 -2, i64 -2>
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
+; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, [[MIDDLE_BLOCK]] ], [ 511, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_CMP]], i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[CMP3_NOT:%.*]] = icmp eq i32 [[TMP21]], 3
+; CHECK-NEXT:    br i1 [[CMP3_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[INDVARS_IV_STRIDED:%.*]] = mul i64 [[INDVARS_IV]], 2
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[INDVARS_IV_STRIDED]]
+; CHECK-NEXT:    [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i32 [[TMP22]], 2
+; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i32 [[MUL]], ptr [[ARRAYIDX7]], align 4
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; CHECK-NEXT:    [[CMP2_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], 0
+; CHECK-NEXT:    br i1 [[CMP2_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(1024) [[DEST:%.*]], ptr noundef nonnull align 4 dereferenceable(1024) [[LOCAL_DEST]], i64 1024, i1 false)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %local_dest = alloca [1024 x i32], align 4
+  %local_src = alloca [1024 x i32], align 4
+  %local_cmp = alloca [1024 x i32], align 4
+  call void @init(ptr %local_src)
+  call void @init(ptr %local_cmp)
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 511, %entry ], [ %indvars.iv.next, %for.inc ]
+  %arrayidx = getelementptr inbounds [1024 x i32], ptr %local_cmp, i64 0, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %cmp3.not = icmp eq i32 %0, 3
+  br i1 %cmp3.not, label %for.inc, label %if.then
+
+if.then:
+  %indvars.iv.strided = mul i64 %indvars.iv, 2
+  %arrayidx5 = getelementptr inbounds [1024 x i32], ptr %local_src, i64 0, i64 %indvars.iv.strided
+  %1 = load i32, ptr %arrayidx5, align 4
+  %mul = shl nsw i32 %1, 2
+  %arrayidx7 = getelementptr inbounds [1024 x i32], ptr %local_dest, i64 0, i64 %indvars.iv
+  store i32 %mul, ptr %arrayidx7, align 4
+  br label %for.inc
+
+for.inc:
+  %indvars.iv.next = add nsw i64 %indvars.iv, -1
+  %cmp2.not = icmp eq i64 %indvars.iv, 0
+  br i1 %cmp2.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(1024) %dest, ptr noundef nonnull align 4 dereferenceable(1024) %local_dest, i64 1024, i1 false)
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-neg-off.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-neg-off.ll
new file mode 100644
index 00000000000000..1dd526df503bd1
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-neg-off.ll
@@ -0,0 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1-p:16:16:16:16"
+
+declare void @init(ptr nocapture nofree)
+
+
+; Test where offset relative to alloca is negative and we shouldn't
+; treat predicated loads as being always dereferenceable.
+define i8 @test_negative_off(i16 %len, ptr %test_base) {
+; CHECK-LABEL: @test_negative_off(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [64638 x i8], align 1
+; CHECK-NEXT:    call void @init(ptr [[ALLOCA]])
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE2:%.*]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i8> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[PRED_LOAD_CONTINUE2]] ]
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i16 -1000, [[DOTCAST]]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add i16 [[OFFSET_IDX]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE:%.*]], i16 [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i16 [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i1, ptr [[TMP2]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = load i1, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i1> poison, i1 [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i1> [[TMP6]], i1 [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; CHECK-NEXT:    br i1 [[TMP8]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; CHECK:       pred.load.if:
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[ALLOCA]], i16 [[TMP0]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x i8> poison, i8 [[TMP10]], i32 0
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; CHECK:       pred.load.continue:
+; CHECK-NEXT:    [[TMP12:%.*]] = phi <2 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP11]], [[PRED_LOAD_IF]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
+; CHECK-NEXT:    br i1 [[TMP13]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2]]
+; CHECK:       pred.load.if1:
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[ALLOCA]], i16 [[TMP1]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[TMP14]], align 1
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <2 x i8> [[TMP12]], i8 [[TMP15]], i32 1
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE2]]
+; CHECK:       pred.load.continue2:
+; CHECK-NEXT:    [[TMP17:%.*]] = phi <2 x i8> [ [[TMP12]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP16]], [[PRED_LOAD_IF1]] ]
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP7]], <2 x i8> [[TMP17]], <2 x i8> zeroinitializer
+; CHECK-NEXT:    [[TMP18]] = add <2 x i8> [[VEC_PHI]], [[PREDPHI]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], 12
+; CHECK-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP20:%.*]] = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> [[TMP18]])
+; CHECK-NEXT:    br i1 true, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ -988, [[MIDDLE_BLOCK]] ], [ -1000, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i8 [ [[TMP20]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
+; CHECK-NEXT:    [[ACCUM:%.*]] = phi i8 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
+; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i16 [[IV]]
+; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1
+; CHECK-NEXT:    br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]]
+; CHECK:       pred:
+; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr i8, ptr [[ALLOCA]], i16 [[IV]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i8, ptr [[ADDR]], align 1
+; CHECK-NEXT:    br label [[LATCH]]
+; CHECK:       latch:
+; CHECK-NEXT:    [[VAL_PHI:%.*]] = phi i8 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ]
+; CHECK-NEXT:    [[ACCUM_NEXT]] = add i8 [[ACCUM]], [[VAL_PHI]]
+; CHECK-NEXT:    [[EXIT:%.*]] = icmp ugt i16 [[IV]], -990
+; CHECK-NEXT:    br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       loop_exit:
+; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i8 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i8 [[ACCUM_NEXT_LCSSA]]
+;
+entry:
+  %alloca = alloca [64638 x i8]
+  call void @init(ptr %alloca)
+  br label %loop
+loop:
+  %iv = phi i16 [ -1000, %entry ], [ %iv.next, %latch ]
+  %accum = phi i8 [ 0, %entry ], [ %accum.next, %latch ]
+  %iv.next = add i16 %iv, 1
+  %test_addr = getelementptr inbounds i1, ptr %test_base, i16 %iv
+  %earlycnd = load i1, ptr %test_addr
+  br i1 %earlycnd, label %pred, label %latch
+pred:
+  %addr = getelementptr i8, ptr %alloca, i16 %iv
+  %val = load i8, ptr %addr
+  br label %latch
+latch:
+  %val.phi = phi i8 [ 0, %loop ], [ %val, %pred ]
+  %accum.next = add i8 %accum, %val.phi
+  %exit = icmp ugt i16 %iv, -990
+  br i1 %exit, label %loop_exit, label %loop
+loop_exit:
+  ret i8 %accum.next
+}

>From 3294f86a3193a9dc6e583c96948ff0b0df5235fd Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Tue, 23 Jul 2024 07:34:40 +0000
Subject: [PATCH 2/2] [LoopVectorize] Add support for reverse loops in
 isDereferenceableAndAlignedInLoop

Currently, when we encounter a negative step in the induction
variable, isDereferenceableAndAlignedInLoop bails out because
the element size compares signed-greater-than the step. This
patch adds support for negative steps in cases where we can
detect that the start address of the load is of the form
base + offset. In this case the address decrements on each
iteration, so we need to calculate the access size differently.
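
As a concrete illustration (a hand-worked sketch with made-up
names, not code from the patch itself): consider a loop that
walks an i32 array in reverse from index 1023 down to 0, i.e.
an address recurrence of {base + 4092,+,-4} with trip count
1024. A minimal C++ mock-up of the arithmetic:

  // Hypothetical worked example: access size for a reverse i32 loop.
  #include <algorithm>
  #include <cstdint>
  #include <cstdio>

  int main() {
    const int64_t EltSize = 4;   // loading i32
    const int64_t Step = -4;     // address drops by 4 bytes per iteration
    const int64_t TC = 1024;     // trip count
    const int64_t Start = 4092;  // first access at base + 1023 * 4

    // Offset accessed on the final iteration: Start + (TC - 1) * Step.
    const int64_t End = Start + (TC - 1) * Step;  // 0
    const int64_t Min = std::min(Start, End);     // 0, becomes the new base
    const int64_t Max = std::max(Start, End);     // 4092

    // Total access size = (Max - Min) + EltSize.
    std::printf("AccessSize = %lld\n", (long long)(Max - Min + EltSize));
    return 0;  // prints AccessSize = 4096, i.e. the whole 1024 x i32 array
  }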

The motivation for this patch comes from PR #88385, where a
reviewer requested reusing isDereferenceableAndAlignedInLoop,
but that PR itself supports reverse loops, which this function
previously did not handle.
---
 llvm/lib/Analysis/Loads.cpp                   | 69 ++++++++++-----
 .../LoopVectorize/load-deref-pred-align.ll    | 85 ++++++++++---------
 2 files changed, 94 insertions(+), 60 deletions(-)

diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index a88469ab81a8c8..759d3d0d8d27e0 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -293,7 +293,15 @@ bool llvm::isDereferenceableAndAlignedInLoop(LoadInst *LI, Loop *L,
 
   // TODO: Handle overlapping accesses.
   // We should be computing AccessSize as (TC - 1) * Step + EltSize.
-  if (EltSize.sgt(Step->getAPInt()))
+  bool StepIsNegative = Step->getAPInt().isNegative();
+  APInt AbsStep = Step->getAPInt().abs();
+  if (EltSize.ugt(AbsStep))
+    return false;
+
+  // For the moment, restrict ourselves to the case where the access size is a
+  // multiple of the requested alignment and the base is aligned.
+  // TODO: generalize if a case is found which warrants it
+  if (EltSize.urem(Alignment.value()) != 0)
     return false;
 
   // Compute the total access size for access patterns with unit stride and
@@ -301,18 +309,49 @@ bool llvm::isDereferenceableAndAlignedInLoop(LoadInst *LI, Loop *L,
   // same.
   // For patterns with gaps (i.e. non unit stride), we are
   // accessing EltSize bytes at every Step.
-  APInt AccessSize = TC * Step->getAPInt();
+  APInt AccessSize = TC * AbsStep;
 
   assert(SE.isLoopInvariant(AddRec->getStart(), L) &&
          "implied by addrec definition");
   Value *Base = nullptr;
   if (auto *StartS = dyn_cast<SCEVUnknown>(AddRec->getStart())) {
+    if (StepIsNegative)
+      return false;
     Base = StartS->getValue();
   } else if (auto *StartS = dyn_cast<SCEVAddExpr>(AddRec->getStart())) {
-    // Handle (NewBase + offset) as start value.
-    const auto *Offset = dyn_cast<SCEVConstant>(StartS->getOperand(0));
-    const auto *NewBase = dyn_cast<SCEVUnknown>(StartS->getOperand(1));
-    if (StartS->getNumOperands() == 2 && Offset && NewBase) {
+    const SCEV *End = AddRec->evaluateAtIteration(
+        SE.getConstant(StartS->getType(), TC - 1), SE);
+
+    // The step recurrence could be negative so it's necessary to find the min
+    // and max accessed addresses in the loop.
+    const SCEV *Min = SE.getUMinExpr(StartS, End);
+    const SCEV *Max = SE.getUMaxExpr(StartS, End);
+    if (isa<SCEVCouldNotCompute>(Min) || isa<SCEVCouldNotCompute>(Max))
+      return false;
+
+    // Now calculate the total access size, which is (max - min) + element_size.
+    const SCEV *Diff = SE.getMinusSCEV(Max, Min);
+    if (isa<SCEVCouldNotCompute>(Diff))
+      return false;
+
+    const SCEV *AS = SE.getAddExpr(
+        Diff, SE.getConstant(Diff->getType(), EltSize.getZExtValue()));
+    auto *ASC = dyn_cast<SCEVConstant>(AS);
+    if (!ASC)
+      return false;
+
+    if (const SCEVUnknown *NewBase = dyn_cast<SCEVUnknown>(Min)) {
+      Base = NewBase->getValue();
+      AccessSize = ASC->getAPInt();
+    } else if (auto *MinAddRec = dyn_cast<SCEVAddExpr>(Min)) {
+      if (MinAddRec->getNumOperands() != 2)
+        return false;
+
+      const auto *Offset = dyn_cast<SCEVConstant>(MinAddRec->getOperand(0));
+      const auto *NewBase = dyn_cast<SCEVUnknown>(MinAddRec->getOperand(1));
+      if (!Offset || !NewBase)
+        return false;
+
       // The following code below assumes the offset is unsigned, but GEP
       // offsets are treated as signed so we can end up with a signed value
       // here too. For example, suppose the initial PHI value is (i8 255),
@@ -325,22 +364,14 @@ bool llvm::isDereferenceableAndAlignedInLoop(LoadInst *LI, Loop *L,
       // TODO: generalize if a case found which warrants
       if (Offset->getAPInt().urem(Alignment.value()) != 0)
         return false;
-      Base = NewBase->getValue();
-      bool Overflow = false;
-      AccessSize = AccessSize.uadd_ov(Offset->getAPInt(), Overflow);
-      if (Overflow)
-        return false;
-    }
-  }
 
-  if (!Base)
+      AccessSize = ASC->getAPInt() + Offset->getAPInt();
+      Base = NewBase->getValue();
+    } else
+      return false;
+  } else
     return false;
 
-  // For the moment, restrict ourselves to the case where the access size is a
-  // multiple of the requested alignment and the base is aligned.
-  // TODO: generalize if a case found which warrants
-  if (EltSize.urem(Alignment.value()) != 0)
-    return false;
   return isDereferenceableAndAlignedPointer(Base, Alignment, AccessSize, DL,
                                             HeaderFirstNonPHI, AC, &DT);
 }
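
To summarise the new flow outside of the diff (a simplified,
hand-written approximation with the SCEV machinery elided; this
is not the actual Loads.cpp code): the check now computes the
lowest and highest byte offsets the recurrence can touch and
requires everything from the base up to the highest access,
plus one element, to be dereferenceable.

  // Hypothetical distillation of the updated check (names invented for
  // this sketch). StartOffset is the byte offset of the first access
  // relative to a known-dereferenceable base pointer; Step may be negative.
  #include <algorithm>
  #include <cstdint>

  bool computeLoopAccessSize(int64_t StartOffset, int64_t Step, uint64_t TC,
                             uint64_t EltSize, uint64_t Alignment,
                             uint64_t &AccessSize) {
    // Overlapping accesses are still TODO: require |Step| >= EltSize.
    const uint64_t AbsStep = (uint64_t)(Step < 0 ? -Step : Step);
    if (EltSize > AbsStep)
      return false;
    // Lowest and highest byte offsets touched across all TC iterations.
    const int64_t EndOffset = StartOffset + (int64_t)(TC - 1) * Step;
    const int64_t Min = std::min(StartOffset, EndOffset);
    const int64_t Max = std::max(StartOffset, EndOffset);
    if (Min < 0)
      return false;  // would step below the base; bail like the real code
    // The element size and minimum offset must be multiples of the
    // requested alignment for the aligned-base reasoning to hold.
    if (EltSize % Alignment != 0 || (uint64_t)Min % Alignment != 0)
      return false;
    // Everything in [base, base + Max + EltSize) must be dereferenceable.
    AccessSize = (uint64_t)Max + EltSize;
    return true;
  }

A caller would then hand the base, alignment and resulting
AccessSize to isDereferenceableAndAlignedPointer, much as the
final return statement in the patched function does.
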
diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll
index 344181bfa0d1e0..f4a8777bf1487a 100644
--- a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll
+++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll
@@ -311,7 +311,7 @@ define void @test_rev_loops_deref_loads(ptr nocapture noundef writeonly %dest) {
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_CMP]], i64 0, i64 [[TMP0]]
@@ -321,30 +321,33 @@ define void @test_rev_loops_deref_loads(ptr nocapture noundef writeonly %dest) {
 ; CHECK-NEXT:    [[REVERSE:%.*]] = shufflevector <2 x i32> [[WIDE_LOAD]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <2 x i32> [[REVERSE]], <i32 3, i32 3>
 ; CHECK-NEXT:    [[TMP5:%.*]] = xor <2 x i1> [[TMP4]], <i1 true, i1 true>
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
-; CHECK-NEXT:    br i1 [[TMP6]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i32, ptr [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i32 -1
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <2 x i32>, ptr [[TMP8]], align 4
+; CHECK-NEXT:    [[REVERSE2:%.*]] = shufflevector <2 x i32> [[WIDE_LOAD1]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
+; CHECK-NEXT:    br i1 [[TMP9]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; CHECK:       pred.store.if:
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP10:%.*]] = shl nsw i32 [[TMP8]], 2
-; CHECK-NEXT:    store i32 [[TMP10]], ptr [[TMP9]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i32> [[REVERSE2]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = shl nsw i32 [[TMP11]], 2
+; CHECK-NEXT:    store i32 [[TMP12]], ptr [[TMP10]], align 4
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; CHECK:       pred.store.continue:
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
-; CHECK-NEXT:    br i1 [[TMP11]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
-; CHECK:       pred.store.if1:
-; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], -1
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = load i32, ptr [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP16:%.*]] = shl nsw i32 [[TMP14]], 2
-; CHECK-NEXT:    store i32 [[TMP16]], ptr [[TMP15]], align 4
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE2]]
-; CHECK:       pred.store.continue2:
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
+; CHECK-NEXT:    br i1 [[TMP13]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4]]
+; CHECK:       pred.store.if3:
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], -1
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x i32> [[REVERSE2]], i32 1
+; CHECK-NEXT:    [[TMP17:%.*]] = shl nsw i32 [[TMP16]], 2
+; CHECK-NEXT:    store i32 [[TMP17]], ptr [[TMP15]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE4]]
+; CHECK:       pred.store.continue4:
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -353,13 +356,13 @@ define void @test_rev_loops_deref_loads(ptr nocapture noundef writeonly %dest) {
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_CMP]], i64 0, i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[CMP3_NOT:%.*]] = icmp eq i32 [[TMP18]], 3
+; CHECK-NEXT:    [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[CMP3_NOT:%.*]] = icmp eq i32 [[TMP19]], 3
 ; CHECK-NEXT:    br i1 [[CMP3_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
 ; CHECK:       if.then:
 ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i32 [[TMP19]], 2
+; CHECK-NEXT:    [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i32 [[TMP20]], 2
 ; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    store i32 [[MUL]], ptr [[ARRAYIDX7]], align 4
 ; CHECK-NEXT:    br label [[FOR_INC]]
@@ -635,26 +638,26 @@ define void @test_rev_loops_strided_deref_loads(ptr nocapture noundef writeonly
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <2 x i32> [[REVERSE]], <i32 3, i32 3>
 ; CHECK-NEXT:    [[TMP5:%.*]] = xor <2 x i1> [[TMP4]], <i1 true, i1 true>
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul <2 x i64> [[VEC_IND]], <i64 2, i64 2>
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
-; CHECK-NEXT:    br i1 [[TMP7]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP8]], align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 4
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
+; CHECK-NEXT:    br i1 [[TMP13]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; CHECK:       pred.store.if:
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP12:%.*]] = shl nsw i32 [[TMP10]], 2
-; CHECK-NEXT:    store i32 [[TMP12]], ptr [[TMP11]], align 4
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP15:%.*]] = shl nsw i32 [[TMP11]], 2
+; CHECK-NEXT:    store i32 [[TMP15]], ptr [[TMP14]], align 4
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; CHECK:       pred.store.continue:
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
-; CHECK-NEXT:    br i1 [[TMP13]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
+; CHECK-NEXT:    br i1 [[TMP16]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
 ; CHECK:       pred.store.if1:
-; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], -1
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP14]]
-; CHECK-NEXT:    [[TMP19:%.*]] = shl nsw i32 [[TMP17]], 2
+; CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], -1
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = shl nsw i32 [[TMP12]], 2
 ; CHECK-NEXT:    store i32 [[TMP19]], ptr [[TMP18]], align 4
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE2]]
 ; CHECK:       pred.store.continue2:


