[PATCH] D147000: [RISCV] Cost model for general case of single vector permute

Luke Lau via Phabricator via llvm-commits <llvm-commits at lists.llvm.org>
Tue Mar 28 03:22:16 PDT 2023


luke accepted this revision.
luke added a comment.
This revision is now accepted and ready to land.

LGTM, this matches up with what I'm seeing when generating random masks for shuffles.

Just a small observation: could we improve codegen by using vrgatherei16.vv for i8 when n > 256? The code currently generated for a v512i8 vector shuffle is less than ideal:

  v512:                                   # @v512
  	.cfi_startproc
  # %bb.0:
  	lui	a0, %hi(.LCPI1_0)
  	ld	a0, %lo(.LCPI1_0)(a0)
  	vsetivli	zero, 8, e64, m1, ta, ma
  	vmv.s.x	v12, a0
  	vsetivli	zero, 2, e64, m1, tu, ma
  	vmv1r.v	v0, v12
  	vslideup.vi	v0, v12, 1
  	vsetivli	zero, 3, e64, m1, tu, ma
  	vslideup.vi	v0, v12, 2
  	vsetivli	zero, 4, e64, m1, tu, ma
  	vslideup.vi	v0, v12, 3
  	vsetivli	zero, 5, e64, m1, tu, ma
  	vslideup.vi	v0, v12, 4
  	vsetivli	zero, 6, e64, m1, tu, ma
  	vslideup.vi	v0, v12, 5
  	vsetivli	zero, 7, e64, m1, tu, ma
  	vslideup.vi	v0, v12, 6
  	vsetivli	zero, 8, e64, m1, ta, ma
  	vslideup.vi	v0, v12, 7
  	vsetivli	zero, 1, e8, m4, ta, ma
  	vslidedown.vi	v12, v8, 1
  	vmv.x.s	a1, v12
  	li	a0, 512
  	vsetvli	zero, a0, e8, m4, ta, ma
  	vmv.v.x	v12, a1
  	vsetivli	zero, 1, e8, m4, ta, ma
  	vslidedown.vi	v16, v8, 5
  	vmv.x.s	a1, v16
  	lui	a2, %hi(.LCPI1_1)
  	ld	a2, %lo(.LCPI1_1)(a2)
  	vsetvli	zero, a0, e8, m4, ta, ma
  	vmerge.vxm	v12, v12, a1, v0
  	vsetivli	zero, 8, e64, m1, ta, ma
  	vmv.s.x	v16, a2
  	vsetivli	zero, 2, e64, m1, tu, ma
  	vmv1r.v	v0, v16
  	vslideup.vi	v0, v16, 1
  	vsetivli	zero, 3, e64, m1, tu, ma
  	vslideup.vi	v0, v16, 2
  	vsetivli	zero, 4, e64, m1, tu, ma
  	vslideup.vi	v0, v16, 3
  	vsetivli	zero, 5, e64, m1, tu, ma
  	vslideup.vi	v0, v16, 4
  	vsetivli	zero, 6, e64, m1, tu, ma
  	vslideup.vi	v0, v16, 5
  	vsetivli	zero, 7, e64, m1, tu, ma
  	vslideup.vi	v0, v16, 6
  	vsetivli	zero, 8, e64, m1, ta, ma
  	vslideup.vi	v0, v16, 7
  	vsetivli	zero, 1, e8, m4, ta, ma
  	vslidedown.vi	v16, v8, 4
  	vmv.x.s	a1, v16
  	lui	a2, %hi(.LCPI1_2)
  	ld	a2, %lo(.LCPI1_2)(a2)
  	vsetvli	zero, a0, e8, m4, ta, ma
  	vmerge.vxm	v12, v12, a1, v0
  	vsetivli	zero, 8, e64, m1, ta, ma
  	vmv.s.x	v16, a2
  	vsetivli	zero, 2, e64, m1, tu, ma
  	vmv1r.v	v0, v16
  	vslideup.vi	v0, v16, 1
  	vsetivli	zero, 3, e64, m1, tu, ma
  	vslideup.vi	v0, v16, 2
  	vsetivli	zero, 4, e64, m1, tu, ma
  	vslideup.vi	v0, v16, 3
  	vsetivli	zero, 5, e64, m1, tu, ma
  	vslideup.vi	v0, v16, 4
  	vsetivli	zero, 6, e64, m1, tu, ma
  	vslideup.vi	v0, v16, 5
  	vsetivli	zero, 7, e64, m1, tu, ma
  	vslideup.vi	v0, v16, 6
  	vsetivli	zero, 8, e64, m1, ta, ma
  	vslideup.vi	v0, v16, 7
  	vsetivli	zero, 1, e8, m4, ta, ma
  	vslidedown.vi	v16, v8, 7
  	vmv.x.s	a1, v16
  	lui	a2, %hi(.LCPI1_3)
  	ld	a2, %lo(.LCPI1_3)(a2)
  	vsetvli	zero, a0, e8, m4, ta, ma
  	vmerge.vxm	v12, v12, a1, v0
  	vsetivli	zero, 8, e64, m1, ta, ma
  	vmv.s.x	v16, a2
  	vsetivli	zero, 2, e64, m1, tu, ma
  	vmv1r.v	v0, v16
  	vslideup.vi	v0, v16, 1
  	vsetivli	zero, 3, e64, m1, tu, ma
  	vslideup.vi	v0, v16, 2
  	vsetivli	zero, 4, e64, m1, tu, ma
  	vslideup.vi	v0, v16, 3
  	vsetivli	zero, 5, e64, m1, tu, ma
  	vslideup.vi	v0, v16, 4
  	vsetivli	zero, 6, e64, m1, tu, ma
  	vslideup.vi	v0, v16, 5
  	vsetivli	zero, 7, e64, m1, tu, ma
  	vslideup.vi	v0, v16, 6
  	vsetivli	zero, 8, e64, m1, ta, ma
  	vslideup.vi	v0, v16, 7
  	vsetivli	zero, 1, e8, m4, ta, ma
  	vslidedown.vi	v16, v8, 3
  	vmv.x.s	a1, v16
  	lui	a2, %hi(.LCPI1_4)
  	ld	a2, %lo(.LCPI1_4)(a2)
  	vsetvli	zero, a0, e8, m4, ta, ma
  	vmerge.vxm	v12, v12, a1, v0
  	vsetivli	zero, 8, e64, m1, ta, ma
  	vmv.s.x	v16, a2
  	vsetivli	zero, 2, e64, m1, tu, ma
  	vmv1r.v	v0, v16
  	vslideup.vi	v0, v16, 1
  	vsetivli	zero, 3, e64, m1, tu, ma
  	vslideup.vi	v0, v16, 2
  	vsetivli	zero, 4, e64, m1, tu, ma
  	vslideup.vi	v0, v16, 3
  	vsetivli	zero, 5, e64, m1, tu, ma
  	vslideup.vi	v0, v16, 4
  	vsetivli	zero, 6, e64, m1, tu, ma
  	vslideup.vi	v0, v16, 5
  	vsetivli	zero, 7, e64, m1, tu, ma
  	vslideup.vi	v0, v16, 6
  	vsetivli	zero, 8, e64, m1, ta, ma
  	vslideup.vi	v0, v16, 7
  	vsetivli	zero, 0, e8, m4, ta, ma
  	vmv.x.s	a1, v8
  	lui	a2, %hi(.LCPI1_5)
  	ld	a2, %lo(.LCPI1_5)(a2)
  	vsetvli	zero, a0, e8, m4, ta, ma
  	vmerge.vxm	v12, v12, a1, v0
  	vsetivli	zero, 8, e64, m1, ta, ma
  	vmv.s.x	v16, a2
  	vsetivli	zero, 2, e64, m1, tu, ma
  	vmv1r.v	v0, v16
  	vslideup.vi	v0, v16, 1
  	vsetivli	zero, 3, e64, m1, tu, ma
  	vslideup.vi	v0, v16, 2
  	vsetivli	zero, 4, e64, m1, tu, ma
  	vslideup.vi	v0, v16, 3
  	vsetivli	zero, 5, e64, m1, tu, ma
  	vslideup.vi	v0, v16, 4
  	vsetivli	zero, 6, e64, m1, tu, ma
  	vslideup.vi	v0, v16, 5
  	vsetivli	zero, 7, e64, m1, tu, ma
  	vslideup.vi	v0, v16, 6
  	vsetivli	zero, 8, e64, m1, ta, ma
  	vslideup.vi	v0, v16, 7
  	vsetivli	zero, 1, e8, m4, ta, ma
  	vslidedown.vi	v8, v8, 2
  	vmv.x.s	a1, v8
  	vsetvli	zero, a0, e8, m4, ta, ma
  	vmerge.vxm	v8, v12, a1, v0
  	ret

Below is the v256i8 case for comparison, which is the lowering we are modelling:

  v256:                                   # @v256
  	.cfi_startproc
  # %bb.0:
  	lui	a0, %hi(.LCPI0_0)
  	addi	a0, a0, %lo(.LCPI0_0)
  	li	a1, 32
  	vsetvli	zero, a1, e64, m2, ta, ma
  	vlse64.v	v12, (a0), zero
  	li	a0, 256
  	vsetvli	zero, a0, e8, m2, ta, ma
  	vrgather.vv	v10, v8, v12
  	vmv.v.v	v8, v10
  	ret

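For reference, a vrgatherei16.vv-based lowering of the v512i8 case could plausibly mirror the v256i8 sequence above, just with 16-bit indices so all 512 source bytes are addressable. This is only a rough sketch of what I'd hope for, not actual compiler output; the .LCPI1_idx constant-pool label and the plain vle16.v load of the index vector are placeholders:

  # Hypothetical sketch, not current codegen: load a 512 x i16 index vector
  # from the constant pool, then gather the i8 source with 16-bit indices.
  	lui	a0, %hi(.LCPI1_idx)
  	addi	a0, a0, %lo(.LCPI1_idx)
  	li	a1, 512
  	vsetvli	zero, a1, e16, m8, ta, ma
  	vle16.v	v16, (a0)
  	vsetvli	zero, a1, e8, m4, ta, ma
  	vrgatherei16.vv	v12, v8, v16
  	vmv.v.v	v8, v12
  	ret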

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D147000/new/

https://reviews.llvm.org/D147000
