[llvm] [LV] Transform to handle exits in the scalar loop (PR #148626)
Graham Hunter via llvm-commits
llvm-commits at lists.llvm.org
Tue Nov 18 04:19:04 PST 2025
================
@@ -0,0 +1,155 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
+; RUN: opt -S < %s -p loop-vectorize -handle-early-exits-in-scalar-tail -force-vector-width=4 | FileCheck %s
+
+; Early-exit search with a constant trip count. The loop walks %iv upward from
+; 0 and leaves either when array[%iv] equals %elt (returns 1) or once %iv.next
+; reaches 25 (returns 0). 25 i32 elements correspond to the 100 bytes that the
+; dereferenceable(100) attribute on %array promises.
+;
+; With the forced vector width of 4 from the RUN line, the autogenerated
+; assertions expect a <4 x i32> vector loop that only detects whether any lane
+; matched (via the or-reduction) and then passes its induction value to the
+; scalar loop through the phi in the scalar preheader; the scalar loop is what
+; actually branches to %found.
+define i32 @simple_contains(ptr align 4 dereferenceable(100) readonly %array, i32 %elt) {
+; CHECK-LABEL: define i32 @simple_contains(
+; CHECK-SAME: ptr readonly align 4 dereferenceable(100) [[ARRAY:%.*]], i32 [[ELT:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[ELT]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAY]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP1:%.*]] = freeze <4 x i1> [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP1]])
+; CHECK-NEXT: br i1 [[TMP2]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH_SPLIT:.*]]
+; CHECK: [[VECTOR_PH_SPLIT]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH_SPLIT]] ], [ [[IV:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[IV]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[ARRAY]], i64 [[IV]]
+; CHECK-NEXT: [[UNCOUNTABLE_EXIT_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[IV]], i64 24)
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[LD_ADDR]], <4 x i1> [[UNCOUNTABLE_EXIT_MASK]], <4 x i32> poison)
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD1]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[IV]], 24
+; CHECK-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]]
+; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[IV]], 25
+; CHECK-NEXT: br i1 [[TMP9]], label %[[NOT_FOUND:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IV]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_PH]] ]
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[FOR_INC:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[LD_ADDR1:%.*]] = getelementptr inbounds i32, ptr [[ARRAY]], i64 [[IV1]]
+; CHECK-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR1]], align 4
+; CHECK-NEXT: [[CMP_EARLY:%.*]] = icmp eq i32 [[LD]], [[ELT]]
+; CHECK-NEXT: br i1 [[CMP_EARLY]], label %[[FOUND:.*]], label %[[FOR_INC]]
+; CHECK: [[FOR_INC]]:
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 25
+; CHECK-NEXT: br i1 [[CMP]], label %[[NOT_FOUND]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[FOUND]]:
+; CHECK-NEXT: ret i32 1
+; CHECK: [[NOT_FOUND]]:
+; CHECK-NEXT: ret i32 0
+;
+entry:
+ br label %for.body
+
+; Loop header: loads array[%iv] and takes the data-dependent early exit to
+; %found as soon as the loaded value equals %elt.
+for.body:
+ %iv = phi i64 [ %iv.next, %for.inc ], [ 0, %entry ]
+ %ld.addr = getelementptr inbounds i32, ptr %array, i64 %iv
+ %ld = load i32, ptr %ld.addr, align 4
+ %cmp.early = icmp eq i32 %ld, %elt
+ br i1 %cmp.early, label %found, label %for.inc
+
+; Latch: counted exit to %not.found once all 25 elements have been examined.
+for.inc:
+ %iv.next = add nsw nuw i64 %iv, 1
+ %cmp = icmp eq i64 %iv.next, 25
+ br i1 %cmp, label %not.found, label %for.body
+
+found:
+ ret i32 1
+
+not.found:
+ ret i32 0
+}
+
+; Early-exit search with a runtime trip count. Same shape as the constant-trip
+; search, but over i8 elements and bounded by %n. Instead of a parameter
+; attribute, the entry block states through llvm.assume operand bundles that
+; %array is 1-byte aligned and dereferenceable for %n bytes. A zero trip count
+; is handled by a guard in the entry block that returns 0 without entering the
+; loop.
+;
+; The autogenerated assertions expect the vector trip count to be %n rounded
+; down to a multiple of 4 (%n minus %n urem 4), with the remaining iterations
+; and any detected early exit finished by the scalar loop.
+define i32 @contains_with_variable_tc(ptr readonly %array, i8 %elt, i64 %n) nofree nosync {
+; CHECK-LABEL: define i32 @contains_with_variable_tc(
+; CHECK-SAME: ptr readonly [[ARRAY:%.*]], i8 [[ELT:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[ARRAY]], i64 1), "dereferenceable"(ptr [[ARRAY]], i64 [[N]]) ]
+; CHECK-NEXT: [[ZERO_TC:%.*]] = icmp eq i64 [[N]], 0
+; CHECK-NEXT: br i1 [[ZERO_TC]], label %[[NOT_FOUND:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK: [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[ITERS:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[ELT]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[ARRAY]], align 1
+; CHECK-NEXT: [[TMP0:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP1:%.*]] = freeze <4 x i1> [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP1]])
+; CHECK-NEXT: br i1 [[TMP2]], label %[[SCALAR_PH]], label %[[VECTOR_PH_SPLIT:.*]]
+; CHECK: [[VECTOR_PH_SPLIT]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH_SPLIT]] ], [ [[IV_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[ARRAY]], i64 [[IV_NEXT]]
+; CHECK-NEXT: [[UNCOUNTABLE_EXIT_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[IV_NEXT]], i64 [[ITERS]])
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr align 1 [[TMP3]], <4 x i1> [[UNCOUNTABLE_EXIT_MASK]], <4 x i8> poison)
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD1]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[ITERS]]
+; CHECK-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[CMP]]
+; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[TMP9]], label %[[NOT_FOUND_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IV_NEXT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ], [ 0, %[[VECTOR_PH]] ]
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT1:%.*]], %[[FOR_INC:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i8, ptr [[ARRAY]], i64 [[IV]]
+; CHECK-NEXT: [[LD:%.*]] = load i8, ptr [[LD_ADDR]], align 1
+; CHECK-NEXT: [[CMP_EARLY:%.*]] = icmp eq i8 [[LD]], [[ELT]]
+; CHECK-NEXT: br i1 [[CMP_EARLY]], label %[[FOUND:.*]], label %[[FOR_INC]]
+; CHECK: [[FOR_INC]]:
+; CHECK-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]]
+; CHECK-NEXT: br i1 [[CMP1]], label %[[NOT_FOUND_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK: [[FOUND]]:
+; CHECK-NEXT: ret i32 1
+; CHECK: [[NOT_FOUND_LOOPEXIT]]:
+; CHECK-NEXT: br label %[[NOT_FOUND]]
+; CHECK: [[NOT_FOUND]]:
+; CHECK-NEXT: ret i32 0
+;
+
+; Entry: records the alignment/dereferenceability assumption for %array and
+; skips the loop entirely when %n is zero.
+entry:
+ call void @llvm.assume(i1 true) [ "align"(ptr %array, i64 1), "dereferenceable"(ptr %array, i64 %n) ]
+ %zero.tc = icmp eq i64 %n, 0
+ br i1 %zero.tc, label %not.found, label %for.body
+
+; Loop header: loads array[%iv] and takes the data-dependent early exit to
+; %found as soon as the loaded byte equals %elt.
+for.body:
+ %iv = phi i64 [ %iv.next, %for.inc ], [ 0, %entry ]
+ %ld.addr = getelementptr inbounds i8, ptr %array, i64 %iv
+ %ld = load i8, ptr %ld.addr
+ %cmp.early = icmp eq i8 %ld, %elt
+ br i1 %cmp.early, label %found, label %for.inc
+
+; Latch: counted exit to %not.found once %iv.next reaches %n.
+for.inc:
+ %iv.next = add nsw nuw i64 %iv, 1
+ %cmp = icmp eq i64 %iv.next, %n
+ br i1 %cmp, label %not.found, label %for.body
+
+found:
+ ret i32 1
+
+not.found:
+ ret i32 0
+}
----------------
huntergr-arm wrote:
done
https://github.com/llvm/llvm-project/pull/148626
More information about the llvm-commits mailing list