[llvm] LV: clamp VF with TC only when scalar epilogue is needed (PR #91253)
Nikolay Panchenko via llvm-commits
llvm-commits@lists.llvm.org
Mon Sep 16 07:52:16 PDT 2024
================
@@ -252,36 +252,46 @@ for.end: ; preds = %for.body
define void @trip16_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 {
; CHECK-LABEL: @trip16_i8(
; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 16, [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 16, [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 16, [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
-; CHECK-NEXT: [[TMP3:%.*]] = shl <16 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
-; CHECK-NEXT: [[TMP6:%.*]] = add <16 x i8> [[TMP3]], [[WIDE_LOAD1]]
-; CHECK-NEXT: store <16 x i8> [[TMP6]], ptr [[TMP5]], align 1
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
+; CHECK-NEXT: [[TMP9:%.*]] = shl <vscale x 8 x i8> [[WIDE_LOAD]], shufflevector (<vscale x 8 x i8> insertelement (<vscale x 8 x i8> poison, i8 1, i64 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
+; CHECK-NEXT: [[TMP12:%.*]] = add <vscale x 8 x i8> [[TMP9]], [[WIDE_LOAD1]]
+; CHECK-NEXT: store <vscale x 8 x i8> [[TMP12]], ptr [[TMP11]], align 1
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
----------------
npanchen wrote:
On RISC-V this minimum iteration check is going to be expensive, since the register size is not known at compile time. For instance, this is what is generated before that change (a scalar sketch of the loop itself follows the dump):
```
trip16_i8: # @trip16_i8
# %bb.0: # %entry
vsetivli zero, 16, e8, m1, ta, ma
vle8.v v8, (a1)
vle8.v v9, (a0)
vadd.vv v8, v8, v8
vadd.vv v8, v8, v9
vse8.v v8, (a0)
ret
```
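For reference, here is a hand-written scalar sketch of the loop under discussion, reconstructed from the CHECK lines above (block and value names are illustrative, not copied from the actual test):
```
define void @trip16_i8_sketch(ptr noalias %dst, ptr noalias %src) {
entry:
  br label %for.body

for.body:                                 ; dst[i] += src[i] << 1, i = 0..15
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.src = getelementptr inbounds i8, ptr %src, i64 %iv
  %x = load i8, ptr %gep.src, align 1
  %shl = shl i8 %x, 1
  %gep.dst = getelementptr inbounds i8, ptr %dst, i64 %iv
  %y = load i8, ptr %gep.dst, align 1
  %sum = add i8 %shl, %y
  store i8 %sum, ptr %gep.dst, align 1
  %iv.next = add nuw nsw i64 %iv, 1
  %done = icmp eq i64 %iv.next, 16
  br i1 %done, label %for.end, label %for.body

for.end:
  ret void
}
```
With a trip count of exactly 16, a single fixed-width `<16 x i8>` iteration covers the whole loop, which is why the pre-change code needs no runtime check at all.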
After that change:
```
.type trip16_i8,@function
trip16_i8: # @trip16_i8
# %bb.0: # %entry
addi sp, sp, -32
sd ra, 24(sp) # 8-byte Folded Spill
sd s0, 16(sp) # 8-byte Folded Spill
sd s1, 8(sp) # 8-byte Folded Spill
csrr a2, vlenb
srli a2, a2, 3
li a3, 2
mv s1, a1
mv s0, a0
bgeu a3, a2, .LBB4_2
# %bb.1:
li a1, 0
j .LBB4_3
.LBB4_2: # %vector.ph
csrr a0, vlenb
li a1, 3
call __muldi3
vl1r.v v8, (s1)
vl1r.v v9, (s0)
andi a1, a0, 16
vsetvli a0, zero, e8, m1, ta, ma
vadd.vv v8, v8, v8
vadd.vv v8, v8, v9
vs1r.v v8, (s0)
bnez a1, .LBB4_5
.LBB4_3: # %for.body.preheader
add a0, s0, a1
add s1, s1, a1
addi s0, s0, 16
.LBB4_4: # %for.body
# =>This Inner Loop Header: Depth=1
lbu a1, 0(s1)
lbu a2, 0(a0)
slli a1, a1, 1
add a1, a1, a2
sb a1, 0(a0)
addi a0, a0, 1
addi s1, s1, 1
bne a0, s0, .LBB4_4
.LBB4_5: # %for.end
ld ra, 24(sp) # 8-byte Folded Reload
ld s0, 16(sp) # 8-byte Folded Reload
ld s1, 8(sp) # 8-byte Folded Reload
addi sp, sp, 32
ret
```
However, the long-term plan is to enable EVL vectorization for RISC-V. Since EVL vectorization does not require this extra check, the generated code will again be good (a rough IR sketch of an EVL loop body follows the assembly below):
```
trip16_i8: # @trip16_i8
# %bb.0: # %entry
vsetivli zero, 16, e8, m1, ta, ma
vle8.v v8, (a1)
vle8.v v9, (a0)
vadd.vv v8, v8, v8
vadd.vv v8, v8, v9
vse8.v v8, (a0)
ret
```
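For illustration, here is a hand-written sketch (not the output of any existing pass) of what an EVL-based vector body for this loop could look like, using the VP intrinsics. `%mask` and `%one` are assumed to be an all-true mask and a splat of 1 set up in the preheader:
```
vector.body:
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; remaining elements; get.vector.length clamps the active length on the
  ; final iteration, so no minimum-iters check or scalar epilogue is needed
  %avl = sub i64 16, %index
  %evl = call i32 @llvm.experimental.get.vector.length.i64(i64 %avl, i32 8, i1 true)
  %gep.src = getelementptr inbounds i8, ptr %src, i64 %index
  %v.src = call <vscale x 8 x i8> @llvm.vp.load.nxv8i8.p0(ptr %gep.src, <vscale x 8 x i1> %mask, i32 %evl)
  %v.shl = call <vscale x 8 x i8> @llvm.vp.shl.nxv8i8(<vscale x 8 x i8> %v.src, <vscale x 8 x i8> %one, <vscale x 8 x i1> %mask, i32 %evl)
  %gep.dst = getelementptr inbounds i8, ptr %dst, i64 %index
  %v.dst = call <vscale x 8 x i8> @llvm.vp.load.nxv8i8.p0(ptr %gep.dst, <vscale x 8 x i1> %mask, i32 %evl)
  %v.sum = call <vscale x 8 x i8> @llvm.vp.add.nxv8i8(<vscale x 8 x i8> %v.shl, <vscale x 8 x i8> %v.dst, <vscale x 8 x i1> %mask, i32 %evl)
  call void @llvm.vp.store.nxv8i8.p0(<vscale x 8 x i8> %v.sum, ptr %gep.dst, <vscale x 8 x i1> %mask, i32 %evl)
  %evl.ext = zext i32 %evl to i64
  %index.next = add nuw i64 %index, %evl.ext
  %done = icmp eq i64 %index.next, 16
  br i1 %done, label %for.end, label %vector.body
```
On RVV hardware `@llvm.experimental.get.vector.length` is expected to lower to a single `vsetvli`, which is why the whole runtime check can disappear.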
https://github.com/llvm/llvm-project/pull/91253