[llvm] [RISCV] Reorder the vector register allocation sequence. (PR #69290)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Oct 16 23:25:01 PDT 2023
yanming123456 wrote:
> > an example source:
> > ```
> > void vec(float *restrict out,
> >          float *in1,
> >          float *in2,
> >          float *in3,
> >          size_t n) {
> >   size_t vl;
> >   for (size_t i = 0; i < n; i += vl) {
> >     vl = __riscv_vsetvl_e32m8(n);
> >     vfloat32m8_t v1 = __riscv_vle32_v_f32m8(&in1[i], vl);
> >     vfloat32m8_t v2 = __riscv_vle32_v_f32m8(&in2[i], vl);
> >     vbool4_t true_mask = __riscv_vmfgt(v1, v2, vl);
> >     vbool4_t false_mask = __riscv_vmnot(true_mask, vl);
> >     vfloat32m8_t v3 = __riscv_vle32_v_f32m8_m(false_mask, &in3[i], vl);
> >     vfloat32m8_t val = __riscv_vfsub(true_mask, v1, v2, vl);
> >     val = __riscv_vfsub_mu(false_mask, val, v2, v3, vl);
> >     __riscv_vse32(&out[i], val, vl);
> >   }
> > }
> > ```
> >
> >
> > before:
> > ```
> > vec:                                    # @vec
> > # %bb.0:                                # %entry
> >         beqz a4, .LBB0_4
> > # %bb.1:                                # %for.body.lr.ph
> >         addi sp, sp, -16
> >         csrr a5, vlenb
> >         slli a5, a5, 4
> >         sub sp, sp, a5
> >         li a5, 0
> >         vsetvli a6, a4, e32, m8, ta, ma
> >         slli a7, a6, 2
> > .LBB0_2:                                # %for.body
> >                                         # =>This Inner Loop Header: Depth=1
> >         vsetvli zero, a6, e32, m8, ta, ma
> >         vle32.v v24, (a1)
> >         addi t0, sp, 16
> >         vs8r.v v24, (t0)                # Unknown-size Folded Spill
> >         vle32.v v16, (a2)
> >         vmflt.vv v8, v16, v24
> >         vmnot.m v9, v8
> >         vmv1r.v v0, v9
> >         vle32.v v24, (a3), v0.t
> >         csrr t0, vlenb
> >         slli t0, t0, 3
> >         add t0, t0, sp
> >         addi t0, t0, 16
> >         vs8r.v v24, (t0)                # Unknown-size Folded Spill
> >         vmv1r.v v0, v8
> >         addi t0, sp, 16
> >         vl8r.v v24, (t0)                # Unknown-size Folded Reload
> >         vfsub.vv v24, v24, v16, v0.t
> >         vsetvli zero, zero, e32, m8, ta, mu
> >         vmv1r.v v0, v9
> >         csrr t0, vlenb
> >         slli t0, t0, 3
> >         add t0, t0, sp
> >         addi t0, t0, 16
> >         vl8r.v v8, (t0)                 # Unknown-size Folded Reload
> >         vfsub.vv v24, v16, v8, v0.t
> >         vse32.v v24, (a0)
> >         add a5, a5, a6
> >         add a0, a0, a7
> >         add a3, a3, a7
> >         add a2, a2, a7
> >         add a1, a1, a7
> >         bltu a5, a4, .LBB0_2
> > # %bb.3:
> >         csrr a0, vlenb
> >         slli a0, a0, 4
> >         add sp, sp, a0
> >         addi sp, sp, 16
> > .LBB0_4:                                # %for.cond.cleanup
> >         ret
> > ```
> >
> >
> > after:
> > ```
> > vec:                                    # @vec
> > # %bb.0:                                # %entry
> >         beqz a4, .LBB0_3
> > # %bb.1:                                # %for.body.lr.ph
> >         li a5, 0
> >         vsetvli a6, a4, e32, m8, ta, ma
> >         slli a7, a6, 2
> > .LBB0_2:                                # %for.body
> >                                         # =>This Inner Loop Header: Depth=1
> >         vsetvli zero, a6, e32, m8, ta, mu
> >         vle32.v v16, (a1)
> >         vle32.v v8, (a2)
> >         vmflt.vv v1, v8, v16
> >         vmnot.m v2, v1
> >         vmv1r.v v0, v2
> >         vle32.v v24, (a3), v0.t
> >         vmv1r.v v0, v1
> >         vfsub.vv v16, v16, v8, v0.t
> >         vmv1r.v v0, v2
> >         vfsub.vv v16, v8, v24, v0.t
> >         vse32.v v16, (a0)
> >         add a5, a5, a6
> >         add a0, a0, a7
> >         add a3, a3, a7
> >         add a2, a2, a7
> >         add a1, a1, a7
> >         bltu a5, a4, .LBB0_2
> > .LBB0_3:                                # %for.cond.cleanup
> >         ret
> > ```
>
> I think this can be added to the test cases.
Many existing test cases already show the optimized output, for example llvm/test/CodeGen/RISCV/rvv/abs-vp.ll:607.
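For reference, here is a self-contained copy of the reproducer above, together with a sketch of one way to turn it into a codegen test. The file name and exact command lines are assumptions on my part, not taken from this PR:

```
// reduced.c - standalone version of the example above (file name is hypothetical).
// One possible reproduction flow (assumed, not specified in the PR):
//   clang --target=riscv64 -march=rv64gcv -O2 -S -emit-llvm reduced.c -o reduced.ll
//   llc -mtriple=riscv64 -mattr=+v -O2 reduced.ll -o reduced.s
// CHECK lines for an .ll codegen test would then normally be generated with
// llvm/utils/update_llc_test_checks.py.
#include <riscv_vector.h>
#include <stddef.h>

void vec(float *restrict out, float *in1, float *in2, float *in3, size_t n) {
  size_t vl;
  for (size_t i = 0; i < n; i += vl) {
    vl = __riscv_vsetvl_e32m8(n);
    vfloat32m8_t v1 = __riscv_vle32_v_f32m8(&in1[i], vl);
    vfloat32m8_t v2 = __riscv_vle32_v_f32m8(&in2[i], vl);
    vbool4_t true_mask = __riscv_vmfgt(v1, v2, vl);
    vbool4_t false_mask = __riscv_vmnot(true_mask, vl);
    // Load in3 only where the comparison was false, then do the two masked
    // subtractions; the second one uses the mask-undisturbed (_mu) policy.
    vfloat32m8_t v3 = __riscv_vle32_v_f32m8_m(false_mask, &in3[i], vl);
    vfloat32m8_t val = __riscv_vfsub(true_mask, v1, v2, vl);
    val = __riscv_vfsub_mu(false_mask, val, v2, v3, vl);
    __riscv_vse32(&out[i], val, vl);
  }
}
```

The interesting property to check would be that the loop body contains no vs8r.v/vl8r.v spill and reload pairs, as in the "after" output above.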
https://github.com/llvm/llvm-project/pull/69290