[llvm] [LV][VPlan] Add initial support for CSA vectorization (PR #106560)
Michael Maitland via llvm-commits
llvm-commits at lists.llvm.org
Mon Oct 14 06:15:12 PDT 2024
================
@@ -0,0 +1,2266 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -S -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-min=256 \
+; RUN: -passes=loop-vectorize -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -enable-csa-vectorization | FileCheck %s -check-prefixes=CHECK,EVL
+; RUN: opt < %s -S -passes=loop-vectorize -mtriple=riscv64 -mattr=+v \
+; RUN: -force-tail-folding-style=none -enable-csa-vectorization \
+; RUN: | FileCheck %s -check-prefixes=CHECK,NO-EVL
+
+; This function is generated from the following C/C++ program:
+; int simple_csa_int_select(int N, int *data, int a) {
+; int t = -1;
+; for (int i = 0; i < N; i++) {
+; if (a < data[i])
+; t = data[i];
+; }
+; return t; // use t
+; }
+define i32 @simple_csa_int_select(i32 %N, ptr %data, i64 %a) {
+; EVL-LABEL: define i32 @simple_csa_int_select(
+; EVL-SAME: i32 [[N:%.*]], ptr [[DATA:%.*]], i64 [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+; EVL-NEXT: [[ENTRY:.*]]:
+; EVL-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N]], 0
+; EVL-NEXT: br i1 [[CMP9]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT:.*]]
+; EVL: [[LOOP_PREHEADER]]:
+; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; EVL: [[VECTOR_PH]]:
+; EVL-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4
+; EVL-NEXT: [[TMP2:%.*]] = sub i64 [[TMP17]], 1
+; EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP2]]
+; EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP17]]
+; EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; EVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[A]], i64 0
+; EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; EVL-NEXT: br label %[[VECTOR_BODY:.*]]
+; EVL: [[VECTOR_BODY]]:
+; EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], %[[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, %[[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], %[[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_VL_PHI:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[CSA_VL_SEL:%.*]], %[[VECTOR_BODY]] ]
+; EVL-NEXT: [[AVL:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[EVL_BASED_IV]]
+; EVL-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; EVL-NEXT: [[TMP6:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[TMP6]]
+; EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP8]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP5]])
+; EVL-NEXT: [[TMP9:%.*]] = sext <vscale x 4 x i32> [[VP_OP_LOAD]] to <vscale x 4 x i64>
+; EVL-NEXT: [[TMP10:%.*]] = icmp slt <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP9]]
+; EVL-NEXT: [[CSA_COND_ANYACTIVE:%.*]] = call i1 @llvm.vp.reduce.or.nxv4i1(i1 false, <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP5]])
+; EVL-NEXT: [[CSA_VL_SEL]] = select i1 [[CSA_COND_ANYACTIVE]], i32 [[TMP5]], i32 [[CSA_VL_PHI]]
----------------
michaelmaitland wrote:
This is the generated code:
```
# %bb.0: # %entry
sext.w a3, a0
blez a3, .LBB0_6
# %bb.1: # %loop.preheader
li a3, 0
li a6, 0
slli a0, a0, 32
srli a0, a0, 32
and a5, a5, a7
vsetvli a7, zero, e8, mf2, ta, ma
vmclr.m v8
j .LBB0_3
.LBB0_2: # %vector.body
snez a6, t1
vsetvli t1, zero, e8, mf2, ta, ma
vmv.v.x v14, a6
vmsne.vi v0, v14, 0
vmand.mm v9, v9, v0
vmandn.mm v8, v8, v0
vmor.mm v8, v9, v8
vsetvli zero, zero, e32, m2, ta, ma
vmerge.vvm v10, v10, v12, v0
sub a5, a5, a4
add a3, a7, a3
mv a6, t0
beqz a5, .LBB0_5
.LBB0_3: # %vector.body
sub t0, a0, a3
vsetvli a7, t0, e8, mf2, ta, ma
slli t1, a3, 2
add t1, a1, t1
vle32.v v12, (t1)
vsetvli t1, zero, e64, m4, ta, ma
vsext.vf2 v16, v12
vmsgt.vx v9, v16, a2
vsetvli zero, t0, e8, mf2, ta, ma
vcpop.m t1, v9
mv t0, a7
bnez t1, .LBB0_2
# %bb.4: # %vector.body
mv t0, a6
j .LBB0_2
.LBB0_5: # %middle.block
slli t0, t0, 32
srli a0, t0, 32
vsetvli zero, a0, e32, m2, ta, ma
vid.v v12
vsetivli zero, 1, e32, m1, ta, ma
vmv.v.i v9, -1
vmv1r.v v0, v8
vsetvli zero, a0, e32, m2, ta, ma
vredmax.vs v9, v12, v9, v0.t
vmv.x.s a0, v9
slli a1, a0, 32
srli a1, a1, 32
vsetivli zero, 1, e32, m2, ta, ma
vslidedown.vx v8, v10, a1
vmv.x.s a1, v8
srai a0, a0, 63
or a0, a0, a1
ret
.LBB0_6:
li a0, -1
ret
```
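
For readers following the CSA recurrence, here is a minimal scalar sketch of what the CSA_MASK_PHI / CSA_DATA_PHI bookkeeping in the IR (and the vmand/vmandn/vmor + vmerge sequence above) computes. This is an illustration, not code from the patch: it assumes a fixed VF of 4, the function name `simple_csa_int_select_ref` is made up, and treating lanes past N as inactive stands in for the EVL/CSA_VL handling of the tail-folded variant.
```
/* Illustrative sketch only (not from the PR): models the per-lane recurrence
 * of the CSA vector loop with a fixed VF of 4. Lanes past N are treated as
 * inactive, which stands in for the CSA_VL/EVL bookkeeping. */
#include <stdbool.h>

#define VF 4

int simple_csa_int_select_ref(int N, int *data, long a) {
  bool mask[VF] = {false}; /* which lanes fired in the last any-active step */
  int last[VF] = {0};      /* the data vector captured at that step         */

  for (int i = 0; i < N; i += VF) {
    bool cond[VF] = {false};
    bool any_active = false;                 /* CSA_COND_ANYACTIVE */
    for (int l = 0; l < VF && i + l < N; l++) {
      cond[l] = a < data[i + l];
      any_active |= cond[l];
    }
    if (any_active) {
      for (int l = 0; l < VF; l++)
        mask[l] = cond[l];                   /* CSA_MASK_SEL */
      for (int l = 0; l < VF && i + l < N; l++)
        last[l] = data[i + l];               /* CSA_DATA_SEL */
    }
  }

  /* Middle block: the highest active lane corresponds to the last scalar
   * iteration whose condition was true; fall back to -1 if no lane fired. */
  for (int l = VF - 1; l >= 0; l--)
    if (mask[l])
      return last[l];
  return -1;
}
```
In the assembly above, the middle block (vid.v plus the masked vredmax.vs, then vslidedown.vx) is this highest-active-lane extraction, and the final srai/or pair supplies the -1 fallback when no lane was ever active.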
https://github.com/llvm/llvm-project/pull/106560