[llvm] [RISCV] Fix missing scaling by LMUL in cost model (PR #73342)

Luke Lau via llvm-commits llvm-commits at lists.llvm.org
Tue Nov 28 21:20:02 PST 2023


lukel97 wrote:

From the LLVM test suite, this reduces the VF in a couple of places, e.g. in SingleSource/Regression/C/gcc-c-torture/execute/loop-2d.c:

```diff
--- build.head/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-loop-2d.dir/loop-2d.s   2023-11-27 04:14:09.994315098 +0000
+++ build/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-loop-2d.dir/loop-2d.s        2023-11-27 04:14:04.982461178 +0000
@@ -12,7 +12,7 @@
 .Lpcrel_hi0:
        auipc   a1, %pcrel_hi(a)
        addi    a2, a1, %pcrel_lo(.Lpcrel_hi0)
-       li      a1, 16
+       li      a1, 8
        add     a3, a2, a3
        bgeu    a0, a1, .LBB0_3
 # %bb.2:
@@ -21,38 +21,35 @@
 .LBB0_3:                                # %vector.ph
        slli    a4, a0, 32
        srli    a4, a4, 32
-       andi    a5, a4, -16
+       andi    a5, a4, -8
        slli    a1, a5, 2
        sub     a3, a3, a1
        subw    a1, a0, a5
-       vsetivli        zero, 8, e32, m2, ta, ma
-       vid.v   v8
-       vrsub.vi        v8, v8, 0
+       vsetivli        zero, 4, e32, m1, ta, ma
+       vid.v   v9
+       vrsub.vi        v8, v9, 0
        vadd.vx v8, v8, a0
        slli    a0, a0, 2
        add     a0, a0, a2
-       addi    a0, a0, -64
+       addi    a0, a0, -32
        li      a6, 3
        addi    a7, a2, -3
-       addi    t0, a2, -27
-       vsetvli zero, zero, e16, m1, ta, ma
-       vid.v   v10
-       vrsub.vi        v10, v10, 7
+       addi    t0, a2, -15
+       vrsub.vi        v9, v9, 3
        mv      t1, a5
 .LBB0_4:                                # %vector.body
                                         # =>This Inner Loop Header: Depth=1
-       vsetvli zero, zero, e32, m2, ta, ma
-       vmul.vx v12, v8, a6
-       vadd.vx v14, v12, a7
-       vadd.vx v12, v12, t0
-       vrgatherei16.vv v16, v14, v10
-       addi    t2, a0, 32
-       vse32.v v16, (t2)
-       vrgatherei16.vv v14, v12, v10
-       vse32.v v14, (a0)
-       vadd.vi v8, v8, -16
-       addi    t1, t1, -16
-       addi    a0, a0, -64
+       vmul.vx v10, v8, a6
+       vadd.vx v11, v10, a7
+       vadd.vx v10, v10, t0
+       vrgather.vv     v12, v11, v9
+       addi    t2, a0, 16
+       vse32.v v12, (t2)
+       vrgather.vv     v11, v10, v9
+       vse32.v v11, (a0)
+       vadd.vi v8, v8, -8
+       addi    t1, t1, -8
+       addi    a0, a0, -32
        bnez    t1, .LBB0_4
 # %bb.5:                                # %middle.block
        beq     a5, a4, .LBB0_8
```

I presume in this specific case it's coming from `experimental_stepvector` being more expensive now.

(As an aside, should we be able to PRE on that `vsetvli zero, zero, e32, m2, ta, ma` in the loop body?)

https://github.com/llvm/llvm-project/pull/73342


More information about the llvm-commits mailing list