[llvm] 8075445 - [LoopVectorize] Add tests for dereferenceable loads in more loops (#118470)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 3 04:41:33 PST 2024
Author: David Sherwood
Date: 2024-12-03T12:41:30Z
New Revision: 807544561310d49b51915f365b7521412d68c219
URL: https://github.com/llvm/llvm-project/commit/807544561310d49b51915f365b7521412d68c219
DIFF: https://github.com/llvm/llvm-project/commit/807544561310d49b51915f365b7521412d68c219.diff
LOG: [LoopVectorize] Add tests for dereferenceable loads in more loops (#118470)
* Adds tests for strided accesses.
* Adds tests for reverse loops.
As part of this I've moved one of the negative tests from
load-deref-pred-align.ll into a new file
(load-deref-pred-neg-off.ll) because the pointer type had a
size of 16 bits and I realised it's probably not sensible for
allocas that are >16 bits in size!
Added:
llvm/test/Transforms/LoopVectorize/load-deref-pred-neg-off.ll
Modified:
llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll
Removed:
################################################################################
diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll
index 1ef01e3b793d5b..bf22d63850835d 100644
--- a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll
+++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S %s | FileCheck %s
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1-p:16:16:16:16"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
declare void @init(ptr nocapture nofree)
@@ -200,104 +200,6 @@ loop_exit:
}
-; Test where offset relative to alloca is negative and we shouldn't
-; treat predicated loads as being always dereferenceable.
-define i8 @test_negative_off(i16 %len, ptr %test_base) {
-; CHECK-LABEL: @test_negative_off(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [64638 x i8], align 1
-; CHECK-NEXT: call void @init(ptr [[ALLOCA]])
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE2:%.*]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i8> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[PRED_LOAD_CONTINUE2]] ]
-; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i16 -1000, [[DOTCAST]]
-; CHECK-NEXT: [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i16 [[OFFSET_IDX]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE:%.*]], i16 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i16 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = load i1, ptr [[TMP2]], align 1
-; CHECK-NEXT: [[TMP5:%.*]] = load i1, ptr [[TMP3]], align 1
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i1> poison, i1 [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i1> [[TMP6]], i1 [[TMP5]], i32 1
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
-; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; CHECK: pred.load.if:
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[ALLOCA]], i16 [[TMP0]]
-; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i8> poison, i8 [[TMP10]], i32 0
-; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]]
-; CHECK: pred.load.continue:
-; CHECK-NEXT: [[TMP12:%.*]] = phi <2 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP11]], [[PRED_LOAD_IF]] ]
-; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
-; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2]]
-; CHECK: pred.load.if1:
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[ALLOCA]], i16 [[TMP1]]
-; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[TMP14]], align 1
-; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x i8> [[TMP12]], i8 [[TMP15]], i32 1
-; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE2]]
-; CHECK: pred.load.continue2:
-; CHECK-NEXT: [[TMP17:%.*]] = phi <2 x i8> [ [[TMP12]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP16]], [[PRED_LOAD_IF1]] ]
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP7]], <2 x i8> [[TMP17]], <2 x i8> zeroinitializer
-; CHECK-NEXT: [[TMP18]] = add <2 x i8> [[VEC_PHI]], [[PREDPHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], 12
-; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[TMP20:%.*]] = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> [[TMP18]])
-; CHECK-NEXT: br i1 true, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ -988, [[MIDDLE_BLOCK]] ], [ -1000, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ [[TMP20]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
-; CHECK-NEXT: [[ACCUM:%.*]] = phi i8 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ]
-; CHECK-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1
-; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i16 [[IV]]
-; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1
-; CHECK-NEXT: br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]]
-; CHECK: pred:
-; CHECK-NEXT: [[ADDR:%.*]] = getelementptr i8, ptr [[ALLOCA]], i16 [[IV]]
-; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr [[ADDR]], align 1
-; CHECK-NEXT: br label [[LATCH]]
-; CHECK: latch:
-; CHECK-NEXT: [[VAL_PHI:%.*]] = phi i8 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ]
-; CHECK-NEXT: [[ACCUM_NEXT]] = add i8 [[ACCUM]], [[VAL_PHI]]
-; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i16 [[IV]], -990
-; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK: loop_exit:
-; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i8 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i8 [[ACCUM_NEXT_LCSSA]]
-;
-entry:
- %alloca = alloca [64638 x i8]
- call void @init(ptr %alloca)
- br label %loop
-loop:
- %iv = phi i16 [ -1000, %entry ], [ %iv.next, %latch ]
- %accum = phi i8 [ 0, %entry ], [ %accum.next, %latch ]
- %iv.next = add i16 %iv, 1
- %test_addr = getelementptr inbounds i1, ptr %test_base, i16 %iv
- %earlycnd = load i1, ptr %test_addr
- br i1 %earlycnd, label %pred, label %latch
-pred:
- %addr = getelementptr i8, ptr %alloca, i16 %iv
- %val = load i8, ptr %addr
- br label %latch
-latch:
- %val.phi = phi i8 [ 0, %loop ], [ %val, %pred ]
- %accum.next = add i8 %accum, %val.phi
- %exit = icmp ugt i16 %iv, -990
- br i1 %exit, label %loop_exit, label %loop
-loop_exit:
- ret i8 %accum.next
-}
-
-
define i32 @loop_requires_scev_predicate(ptr %dest, i32 %end) {
; CHECK-LABEL: @loop_requires_scev_predicate(
; CHECK-NEXT: entry:
@@ -423,3 +325,430 @@ for.inc:
exit:
ret i32 0
}
+
+
+; Test reverse loops where we should be able to prove loads in predicated blocks
+; are safe to load unconditionally.
+define void @test_rev_loops_deref_loads(ptr nocapture noundef writeonly %dest) {
+; CHECK-LABEL: @test_rev_loops_deref_loads(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[LOCAL_DEST:%.*]] = alloca [1024 x i32], align 4
+; CHECK-NEXT: [[LOCAL_SRC:%.*]] = alloca [1024 x i32], align 4
+; CHECK-NEXT: [[LOCAL_CMP:%.*]] = alloca [1024 x i32], align 4
+; CHECK-NEXT: call void @init(ptr [[LOCAL_SRC]])
+; CHECK-NEXT: call void @init(ptr [[LOCAL_CMP]])
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_CMP]], i64 0, i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 -1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4
+; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <2 x i32> [[WIDE_LOAD]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <2 x i32> [[REVERSE]], splat (i32 3)
+; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i1> [[TMP4]], splat (i1 true)
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
+; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK: pred.store.if:
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP0]]
+; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 4
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP0]]
+; CHECK-NEXT: [[TMP12:%.*]] = shl nsw i32 [[TMP11]], 2
+; CHECK-NEXT: store i32 [[TMP12]], ptr [[TMP10]], align 4
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]]
+; CHECK: pred.store.continue:
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
+; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4]]
+; CHECK: pred.store.if1:
+; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], -1
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP21]], align 4
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP14]]
+; CHECK-NEXT: [[TMP17:%.*]] = shl nsw i32 [[TMP16]], 2
+; CHECK-NEXT: store i32 [[TMP17]], ptr [[TMP15]], align 4
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]]
+; CHECK: pred.store.continue2:
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, [[MIDDLE_BLOCK]] ], [ 1023, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_CMP]], i64 0, i64 [[IV]]
+; CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[CMP3_NOT:%.*]] = icmp eq i32 [[TMP19]], 3
+; CHECK-NEXT: br i1 [[CMP3_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[IV]]
+; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[TMP20]], 2
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[IV]]
+; CHECK-NEXT: store i32 [[MUL]], ptr [[ARRAYIDX7]], align 4
+; CHECK-NEXT: br label [[FOR_INC]]
+; CHECK: for.inc:
+; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; CHECK-NEXT: [[CMP2_NOT:%.*]] = icmp eq i64 [[IV]], 0
+; CHECK-NEXT: br i1 [[CMP2_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK: exit:
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[DEST:%.*]], ptr [[LOCAL_DEST]], i64 1024, i1 false)
+; CHECK-NEXT: ret void
+;
+entry:
+ %local_dest = alloca [1024 x i32], align 4
+ %local_src = alloca [1024 x i32], align 4
+ %local_cmp = alloca [1024 x i32], align 4
+ call void @init(ptr %local_src)
+ call void @init(ptr %local_cmp)
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 1023, %entry ], [ %iv.next, %for.inc ]
+ %arrayidx = getelementptr inbounds [1024 x i32], ptr %local_cmp, i64 0, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %cmp3.not = icmp eq i32 %0, 3
+ br i1 %cmp3.not, label %for.inc, label %if.then
+
+if.then:
+ %arrayidx5 = getelementptr inbounds [1024 x i32], ptr %local_src, i64 0, i64 %iv
+ %1 = load i32, ptr %arrayidx5, align 4
+ %mul = shl nsw i32 %1, 2
+ %arrayidx7 = getelementptr inbounds [1024 x i32], ptr %local_dest, i64 0, i64 %iv
+ store i32 %mul, ptr %arrayidx7, align 4
+ br label %for.inc
+
+for.inc:
+ %iv.next = add nsw i64 %iv, -1
+ %cmp2.not = icmp eq i64 %iv, 0
+ br i1 %cmp2.not, label %exit, label %for.body
+
+exit:
+ call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %local_dest, i64 1024, i1 false)
+ ret void
+}
+
+
+; Test reverse loops where we *cannot* prove loads in predicated blocks are safe
+; to load unconditionally.
+define void @test_rev_loops_non_deref_loads(ptr nocapture noundef writeonly %dest) {
+; CHECK-LABEL: @test_rev_loops_non_deref_loads(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[LOCAL_DEST:%.*]] = alloca [1024 x i32], align 4
+; CHECK-NEXT: [[LOCAL_SRC:%.*]] = alloca [1024 x i32], align 4
+; CHECK-NEXT: [[LOCAL_CMP:%.*]] = alloca [1024 x i32], align 4
+; CHECK-NEXT: call void @init(ptr [[LOCAL_SRC]])
+; CHECK-NEXT: call void @init(ptr [[LOCAL_CMP]])
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 1023, i64 1022>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE2]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i64> [[VEC_IND]], splat (i64 -1)
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_CMP]], i64 0, i64 [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 -1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4
+; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <2 x i32> [[WIDE_LOAD]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <2 x i32> [[REVERSE]], splat (i32 3)
+; CHECK-NEXT: [[TMP6:%.*]] = xor <2 x i1> [[TMP5]], splat (i1 true)
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0
+; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK: pred.store.if:
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = shl nsw i32 [[TMP10]], 2
+; CHECK-NEXT: store i32 [[TMP13]], ptr [[TMP12]], align 4
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]]
+; CHECK: pred.store.continue:
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1
+; CHECK-NEXT: br i1 [[TMP14]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
+; CHECK: pred.store.if1:
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP15]]
+; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP18]]
+; CHECK-NEXT: [[TMP20:%.*]] = shl nsw i32 [[TMP17]], 2
+; CHECK-NEXT: store i32 [[TMP20]], ptr [[TMP19]], align 4
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE2]]
+; CHECK: pred.store.continue2:
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 -2)
+; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, [[MIDDLE_BLOCK]] ], [ 1023, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT: [[OFF:%.*]] = add i64 [[IV]], -1
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_CMP]], i64 0, i64 [[OFF]]
+; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[CMP3_NOT:%.*]] = icmp eq i32 [[TMP22]], 3
+; CHECK-NEXT: br i1 [[CMP3_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[OFF]]
+; CHECK-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[TMP23]], 2
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[OFF]]
+; CHECK-NEXT: store i32 [[MUL]], ptr [[ARRAYIDX7]], align 4
+; CHECK-NEXT: br label [[FOR_INC]]
+; CHECK: for.inc:
+; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; CHECK-NEXT: [[CMP2_NOT:%.*]] = icmp eq i64 [[IV]], 0
+; CHECK-NEXT: br i1 [[CMP2_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK: exit:
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[DEST:%.*]], ptr [[LOCAL_DEST]], i64 1024, i1 false)
+; CHECK-NEXT: ret void
+;
+entry:
+ %local_dest = alloca [1024 x i32], align 4
+ %local_src = alloca [1024 x i32], align 4
+ %local_cmp = alloca [1024 x i32], align 4
+ call void @init(ptr %local_src)
+ call void @init(ptr %local_cmp)
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 1023, %entry ], [ %iv.next, %for.inc ]
+ %off = add i64 %iv, -1
+ %arrayidx = getelementptr inbounds [1024 x i32], ptr %local_cmp, i64 0, i64 %off
+ %0 = load i32, ptr %arrayidx, align 4
+ %cmp3.not = icmp eq i32 %0, 3
+ br i1 %cmp3.not, label %for.inc, label %if.then
+
+if.then:
+ %arrayidx5 = getelementptr inbounds [1024 x i32], ptr %local_src, i64 0, i64 %off
+ %1 = load i32, ptr %arrayidx5, align 4
+ %mul = shl nsw i32 %1, 2
+ %arrayidx7 = getelementptr inbounds [1024 x i32], ptr %local_dest, i64 0, i64 %off
+ store i32 %mul, ptr %arrayidx7, align 4
+ br label %for.inc
+
+for.inc:
+ %iv.next = add nsw i64 %iv, -1
+ %cmp2.not = icmp eq i64 %iv, 0
+ br i1 %cmp2.not, label %exit, label %for.body
+
+exit:
+ call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %local_dest, i64 1024, i1 false)
+ ret void
+}
+
+
+; Test a loop with a positive step recurrence that has a strided access
+define i16 @test_strided_access(i64 %len, ptr %test_base) {
+; CHECK-LABEL: @test_strided_access(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [163840 x i16], align 4
+; CHECK-NEXT: call void @init(ptr [[ALLOCA]])
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[TMP3:%.*]] = icmp sge <2 x i8> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i64> [[VEC_IND]], splat (i64 2)
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = load i16, ptr [[TMP6]], align 2
+; CHECK-NEXT: [[TMP10:%.*]] = load i16, ptr [[TMP8]], align 2
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i16> poison, i16 [[TMP9]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i16> [[TMP11]], i16 [[TMP10]], i32 1
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i16> [[TMP12]], <2 x i16> zeroinitializer
+; CHECK-NEXT: [[TMP13]] = add <2 x i16> [[VEC_PHI]], [[PREDPHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP15:%.*]] = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> [[TMP13]])
+; CHECK-NEXT: br i1 true, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ [[TMP15]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
+; CHECK-NEXT: [[ACCUM:%.*]] = phi i16 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ]
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE]], i64 [[IV]]
+; CHECK-NEXT: [[L_T:%.*]] = load i8, ptr [[TEST_ADDR]], align 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp sge i8 [[L_T]], 0
+; CHECK-NEXT: br i1 [[CMP]], label [[PRED:%.*]], label [[LATCH]]
+; CHECK: pred:
+; CHECK-NEXT: [[IV_STRIDE:%.*]] = mul i64 [[IV]], 2
+; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[IV_STRIDE]]
+; CHECK-NEXT: [[VAL:%.*]] = load i16, ptr [[ADDR]], align 2
+; CHECK-NEXT: br label [[LATCH]]
+; CHECK: latch:
+; CHECK-NEXT: [[VAL_PHI:%.*]] = phi i16 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ]
+; CHECK-NEXT: [[ACCUM_NEXT]] = add i16 [[ACCUM]], [[VAL_PHI]]
+; CHECK-NEXT: [[EXIT:%.*]] = icmp eq i64 [[IV]], 4095
+; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK: loop_exit:
+; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i16 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i16 [[ACCUM_NEXT_LCSSA]]
+;
+entry:
+ %alloca = alloca [163840 x i16], align 4
+ call void @init(ptr %alloca)
+ br label %loop
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
+ %accum = phi i16 [ 0, %entry ], [ %accum.next, %latch ]
+ %iv.next = add i64 %iv, 1
+ %test_addr = getelementptr inbounds i8, ptr %test_base, i64 %iv
+ %l.t = load i8, ptr %test_addr
+ %cmp = icmp sge i8 %l.t, 0
+ br i1 %cmp, label %pred, label %latch
+pred:
+ %iv.stride = mul i64 %iv, 2
+ %addr = getelementptr inbounds i16, ptr %alloca, i64 %iv.stride
+ %val = load i16, ptr %addr, align 2
+ br label %latch
+latch:
+ %val.phi = phi i16 [0, %loop], [%val, %pred]
+ %accum.next = add i16 %accum, %val.phi
+ %exit = icmp eq i64 %iv, 4095
+ br i1 %exit, label %loop_exit, label %loop
+
+loop_exit:
+ ret i16 %accum.next
+}
+
+
+; Test a loop with a negative step recurrence that has a strided access
+define void @test_rev_loops_strided_deref_loads(ptr nocapture noundef writeonly %dest) {
+; CHECK-LABEL: @test_rev_loops_strided_deref_loads(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[LOCAL_DEST:%.*]] = alloca [1024 x i32], align 4
+; CHECK-NEXT: [[LOCAL_SRC:%.*]] = alloca [1024 x i32], align 4
+; CHECK-NEXT: [[LOCAL_CMP:%.*]] = alloca [1024 x i32], align 4
+; CHECK-NEXT: call void @init(ptr [[LOCAL_SRC]])
+; CHECK-NEXT: call void @init(ptr [[LOCAL_CMP]])
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 511, i64 510>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE2]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 511, [[INDEX]]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_CMP]], i64 0, i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 -1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4
+; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <2 x i32> [[WIDE_LOAD]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <2 x i32> [[REVERSE]], splat (i32 3)
+; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i1> [[TMP4]], splat (i1 true)
+; CHECK-NEXT: [[TMP6:%.*]] = mul <2 x i64> [[VEC_IND]], splat (i64 2)
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
+; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK: pred.store.if:
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 4
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP0]]
+; CHECK-NEXT: [[TMP15:%.*]] = shl nsw i32 [[TMP11]], 2
+; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP14]], align 4
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]]
+; CHECK: pred.store.continue:
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
+; CHECK-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
+; CHECK: pred.store.if1:
+; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], -1
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP23]]
+; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP24]], align 4
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP17]]
+; CHECK-NEXT: [[TMP19:%.*]] = shl nsw i32 [[TMP12]], 2
+; CHECK-NEXT: store i32 [[TMP19]], ptr [[TMP18]], align 4
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE2]]
+; CHECK: pred.store.continue2:
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 -2)
+; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
+; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, [[MIDDLE_BLOCK]] ], [ 511, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_CMP]], i64 0, i64 [[IV]]
+; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[CMP3_NOT:%.*]] = icmp eq i32 [[TMP21]], 3
+; CHECK-NEXT: br i1 [[CMP3_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[IV_STRIDED:%.*]] = mul i64 [[IV]], 2
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[IV_STRIDED]]
+; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[TMP22]], 2
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[IV]]
+; CHECK-NEXT: store i32 [[MUL]], ptr [[ARRAYIDX7]], align 4
+; CHECK-NEXT: br label [[FOR_INC]]
+; CHECK: for.inc:
+; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; CHECK-NEXT: [[CMP2_NOT:%.*]] = icmp eq i64 [[IV]], 0
+; CHECK-NEXT: br i1 [[CMP2_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK: exit:
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[DEST:%.*]], ptr [[LOCAL_DEST]], i64 1024, i1 false)
+; CHECK-NEXT: ret void
+;
+entry:
+ %local_dest = alloca [1024 x i32], align 4
+ %local_src = alloca [1024 x i32], align 4
+ %local_cmp = alloca [1024 x i32], align 4
+ call void @init(ptr %local_src)
+ call void @init(ptr %local_cmp)
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 511, %entry ], [ %iv.next, %for.inc ]
+ %arrayidx = getelementptr inbounds [1024 x i32], ptr %local_cmp, i64 0, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %cmp3.not = icmp eq i32 %0, 3
+ br i1 %cmp3.not, label %for.inc, label %if.then
+
+if.then:
+ %iv.strided = mul i64 %iv, 2
+ %arrayidx5 = getelementptr inbounds [1024 x i32], ptr %local_src, i64 0, i64 %iv.strided
+ %1 = load i32, ptr %arrayidx5, align 4
+ %mul = shl nsw i32 %1, 2
+ %arrayidx7 = getelementptr inbounds [1024 x i32], ptr %local_dest, i64 0, i64 %iv
+ store i32 %mul, ptr %arrayidx7, align 4
+ br label %for.inc
+
+for.inc:
+ %iv.next = add nsw i64 %iv, -1
+ %cmp2.not = icmp eq i64 %iv, 0
+ br i1 %cmp2.not, label %exit, label %for.body
+
+exit:
+ call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %local_dest, i64 1024, i1 false)
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-neg-off.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-neg-off.ll
new file mode 100644
index 00000000000000..1dd526df503bd1
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-neg-off.ll
@@ -0,0 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1-p:16:16:16:16"
+
+declare void @init(ptr nocapture nofree)
+
+
+; Test where offset relative to alloca is negative and we shouldn't
+; treat predicated loads as being always dereferenceable.
+define i8 @test_negative_off(i16 %len, ptr %test_base) {
+; CHECK-LABEL: @test_negative_off(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [64638 x i8], align 1
+; CHECK-NEXT: call void @init(ptr [[ALLOCA]])
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE2:%.*]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i8> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[PRED_LOAD_CONTINUE2]] ]
+; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i16 -1000, [[DOTCAST]]
+; CHECK-NEXT: [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = add i16 [[OFFSET_IDX]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE:%.*]], i16 [[TMP0]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i16 [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = load i1, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[TMP5:%.*]] = load i1, ptr [[TMP3]], align 1
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i1> poison, i1 [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i1> [[TMP6]], i1 [[TMP5]], i32 1
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; CHECK: pred.load.if:
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[ALLOCA]], i16 [[TMP0]]
+; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i8> poison, i8 [[TMP10]], i32 0
+; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]]
+; CHECK: pred.load.continue:
+; CHECK-NEXT: [[TMP12:%.*]] = phi <2 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP11]], [[PRED_LOAD_IF]] ]
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
+; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2]]
+; CHECK: pred.load.if1:
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[ALLOCA]], i16 [[TMP1]]
+; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[TMP14]], align 1
+; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x i8> [[TMP12]], i8 [[TMP15]], i32 1
+; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE2]]
+; CHECK: pred.load.continue2:
+; CHECK-NEXT: [[TMP17:%.*]] = phi <2 x i8> [ [[TMP12]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP16]], [[PRED_LOAD_IF1]] ]
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP7]], <2 x i8> [[TMP17]], <2 x i8> zeroinitializer
+; CHECK-NEXT: [[TMP18]] = add <2 x i8> [[VEC_PHI]], [[PREDPHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], 12
+; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP20:%.*]] = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> [[TMP18]])
+; CHECK-NEXT: br i1 true, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ -988, [[MIDDLE_BLOCK]] ], [ -1000, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ [[TMP20]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
+; CHECK-NEXT: [[ACCUM:%.*]] = phi i8 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ]
+; CHECK-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1
+; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i16 [[IV]]
+; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1
+; CHECK-NEXT: br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]]
+; CHECK: pred:
+; CHECK-NEXT: [[ADDR:%.*]] = getelementptr i8, ptr [[ALLOCA]], i16 [[IV]]
+; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr [[ADDR]], align 1
+; CHECK-NEXT: br label [[LATCH]]
+; CHECK: latch:
+; CHECK-NEXT: [[VAL_PHI:%.*]] = phi i8 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ]
+; CHECK-NEXT: [[ACCUM_NEXT]] = add i8 [[ACCUM]], [[VAL_PHI]]
+; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i16 [[IV]], -990
+; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: loop_exit:
+; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i8 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i8 [[ACCUM_NEXT_LCSSA]]
+;
+entry:
+ %alloca = alloca [64638 x i8]
+ call void @init(ptr %alloca)
+ br label %loop
+loop:
+ %iv = phi i16 [ -1000, %entry ], [ %iv.next, %latch ]
+ %accum = phi i8 [ 0, %entry ], [ %accum.next, %latch ]
+ %iv.next = add i16 %iv, 1
+ %test_addr = getelementptr inbounds i1, ptr %test_base, i16 %iv
+ %earlycnd = load i1, ptr %test_addr
+ br i1 %earlycnd, label %pred, label %latch
+pred:
+ %addr = getelementptr i8, ptr %alloca, i16 %iv
+ %val = load i8, ptr %addr
+ br label %latch
+latch:
+ %val.phi = phi i8 [ 0, %loop ], [ %val, %pred ]
+ %accum.next = add i8 %accum, %val.phi
+ %exit = icmp ugt i16 %iv, -990
+ br i1 %exit, label %loop_exit, label %loop
+loop_exit:
+ ret i8 %accum.next
+}
More information about the llvm-commits
mailing list