[llvm] [LoopVectorize] Add support for vectorisation of simple early exit loops (PR #88385)
David Sherwood via llvm-commits
llvm-commits at lists.llvm.org
Thu May 9 01:01:24 PDT 2024
================
@@ -0,0 +1,2660 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S < %s -p loop-vectorize -enable-early-exit-vectorization -mattr=+sve \
+; RUN: -mtriple aarch64-linux-gnu | FileCheck %s --check-prefixes=CHECK,MAY_FAULT
+; RUN: opt -S < %s -p loop-vectorize -enable-early-exit-vectorization -vectorizer-no-mem-fault \
+; RUN: -mattr=+sve -mtriple aarch64-linux-gnu | FileCheck %s --check-prefixes=CHECK,NO_FAULT
+
+
+define i64 @same_exit_block_pre_inc_use1() {
+; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 40, [[TMP3]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 40, [[TMP5]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 40, [[N_MOD_VF]]
+; CHECK-NEXT: [[IND_END:%.*]] = add i64 3, [[N_VEC]]
+; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 16
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[FOR_INC3:%.*]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[OFFSET_IDX]], i64 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[DOTSPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = add <vscale x 16 x i64> zeroinitializer, [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = mul <vscale x 16 x i64> [[TMP9]], shufflevector (<vscale x 16 x i64> insertelement (<vscale x 16 x i64> poison, i64 1, i64 0), <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP11:%.*]] = add <vscale x 16 x i64> [[DOTSPLAT]], [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 1
+; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 3
+; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 5
+; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[OFFSET_IDX]], 6
+; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[OFFSET_IDX]], 7
+; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 8
+; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 9
+; CHECK-NEXT: [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 10
+; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 11
+; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], 12
+; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 13
+; CHECK-NEXT: [[TMP26:%.*]] = add i64 [[OFFSET_IDX]], 14
+; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[OFFSET_IDX]], 15
+; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP28]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP29]], align 1
+; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[TMP30]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP31]], align 1
+; CHECK-NEXT: [[TMP32:%.*]] = icmp eq <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT: [[TMP33:%.*]] = xor <vscale x 16 x i1> [[TMP32]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP34:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP33]])
+; CHECK-NEXT: br i1 [[TMP34]], label [[VECTOR_EARLY_EXIT:%.*]], label [[FOR_INC3]]
+; CHECK: loop.inc3:
+; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP7]]
+; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: vector.early.exit:
+; CHECK-NEXT: [[TMP36:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP33]], i1 true)
+; CHECK-NEXT: [[TMP37:%.*]] = add i64 [[TMP36]], [[INDEX1]]
+; CHECK-NEXT: [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP37]]
+; CHECK-NEXT: br label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 40, [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LAND_RHS:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP39:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[TMP38]], [[TMP39]]
+; CHECK-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_END_LOOPEXIT]]
+; CHECK: loop.inc:
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 43
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: loop.end:
+; CHECK-NEXT: [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 43, [[FOR_INC]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ], [ 43, [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i64 [[START_0_LCSSA]]
+;
+entry:
+ %p1 = alloca [1024 x i8]
+ %p2 = alloca [1024 x i8]
----------------
david-arm wrote:
Yeah sure I'll try to come up with a different title. I am aware the vectoriser already supports some simpler early exit cases, which I've made an effort to address in the patch in terms of functionality and terminology. In fact, this patch permits multiple early exits in a loop, provided only one of them has an unknown exit-not-taken count. In such cases a scalar epilogue is still required. Essentially we have to distinguish between loops with countable early exits and non-countable ones.
https://github.com/llvm/llvm-project/pull/88385
More information about the llvm-commits
mailing list