[llvm] [LoopVectorizer] Prune VFs based on plan register pressure (PR #132190)
Luke Lau via llvm-commits
llvm-commits at lists.llvm.org
Mon Apr 7 06:14:51 PDT 2025
https://github.com/lukel97 commented:
Thanks for working on this — it will improve things for RISC-V a lot. E.g., currently we spill heavily on this example because it needs a gather that requires i64 elements, a cost which is only modelled in VPlan and not in the legacy cost model: https://godbolt.org/z/rrPssc5dx
With this patch the chosen VF goes from vscale x 16 down to vscale x 8, which fixes the spilling:
<details><summary>Before</summary>
<p>
```asm
.LBB0_11:
vfcvt.f.xu.v v8, v24
vfmul.vf v8, v8, fa0
vfcvt.rtz.x.f.v v8, v8
vsetivli zero, 16, e32, m8, ta, ma
vslidedown.vi v16, v8, 16
csrr s0, vlenb
slli s0, s0, 4
add s0, s0, sp
addi s0, s0, 16
vs8r.v v16, (s0)
vsetivli zero, 16, e64, m8, ta, ma
vmv.v.x v16, s6
csrr s0, vlenb
slli s0, s0, 3
sh1add s0, s0, s0
add s0, s0, sp
addi s0, s0, 16
vs8r.v v16, (s0)
csrr s0, vlenb
slli s0, s0, 3
sh1add s0, s0, s0
add s0, s0, sp
addi s0, s0, 16
vl8r.v v16, (s0)
vsetvli zero, zero, e32, m4, ta, ma
vwadd.wv v0, v16, v8
addi s0, sp, 16
vs8r.v v0, (s0)
csrr s0, vlenb
slli s0, s0, 3
sh1add s0, s0, s0
add s0, s0, sp
addi s0, s0, 16
vl8r.v v16, (s0)
csrr s0, vlenb
slli s0, s0, 4
add s0, s0, sp
addi s0, s0, 16
vl8r.v v8, (s0)
vwadd.wv v0, v16, v8
csrr s0, vlenb
sh3add s0, s0, sp
addi s0, s0, 16
vs8r.v v0, (s0)
addi s0, sp, 16
vl8r.v v8, (s0)
vsetvli zero, zero, e64, m8, ta, ma
vmul.vx v0, v8, t2
csrr s0, vlenb
sh3add s0, s0, sp
addi s0, s0, 16
vl8r.v v8, (s0)
vmul.vx v8, v8, t2
vsetvli zero, zero, e8, m1, ta, ma
vluxei64.v v16, (a4), v0
vsetvli zero, zero, e64, m8, ta, ma
vadd.vx v0, v0, a4
vsetvli zero, zero, e8, m1, ta, ma
vluxei64.v v18, (a4), v8
vsetvli zero, s5, e8, m2, ta, ma
vslideup.vi v16, v18, 16
vsetivli zero, 16, e8, m1, ta, ma
vluxei64.v v18, (t5), v0
vsetvli zero, zero, e64, m8, ta, ma
vadd.vx v8, v8, a4
vsetvli zero, zero, e8, m1, ta, ma
vluxei64.v v20, (t5), v8
vsetvli zero, s5, e8, m2, ta, ma
vslideup.vi v18, v20, 16
vsetivli zero, 16, e8, m1, ta, ma
vluxei64.v v20, (s4), v0
vluxei64.v v22, (s4), v8
vsetvli zero, s5, e8, m2, ta, ma
vslideup.vi v20, v22, 16
vsseg3e8.v v16, (s1)
vsetvli zero, zero, e32, m8, ta, ma
vadd.vx v24, v24, s5
addi a3, a3, -32
addi s1, s1, 96
bnez a3, .LBB0_11
mv s7, t3
beq a1, t3, .LBB0_3
j .LBB0_6
```
</p>
</details>
<details><summary>After</summary>
<p>
```asm
.LBB0_11: # %vector.body
# Parent Loop BB0_4 Depth=1
# => This Inner Loop Header: Depth=2
vsetvli zero, zero, e32, m4, ta, ma
vfcvt.f.xu.v v24, v20
vfmul.vf v24, v24, fa0
vfcvt.rtz.x.f.v v4, v24
vwadd.wv v24, v8, v4
vsetvli zero, zero, e64, m8, ta, ma
vmul.vx v24, v24, t2
vadd.vx v0, v24, a4
vsetvli zero, zero, e8, m1, ta, ma
vluxei64.v v16, (a4), v24
vluxei64.v v17, (t5), v0
vluxei64.v v18, (s3), v0
vsetvli zero, zero, e32, m4, ta, ma
vadd.vx v20, v20, t4
addi a3, a3, -16
vsetvli zero, zero, e8, m1, ta, ma
vsseg3e8.v v16, (s0)
addi s0, s0, 48
bnez a3, .LBB0_11
```
</p>
</details>
https://github.com/llvm/llvm-project/pull/132190
More information about the llvm-commits
mailing list