[llvm] [RISCV] Use vsetivli instead of `x0,x0` form to retain SEW/LMUL when AVL is imm (PR #169307)

Tue Nov 25 22:22:33 PST 2025

================
@@ -47,6 +47,11 @@ static cl::opt<bool> EnsureWholeVectorRegisterMoveValidVTYPE(
              "vill is cleared"),
     cl::init(true));
 
+static cl::opt<bool> UseVsetivliForImmAVL(
+    DEBUG_TYPE "-use-vsetivli-for-imm-avl", cl::Hidden,
+    cl::desc("Use vsetivli to replace x0,x0 form when AVL is an immediate."),
+    cl::init(true));
----------------
wangpc-pp wrote:

I just wrote a test and there is a stable 0.1%-0.3% performance gain:
```c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

// clang-format off
// Repetition helpers: each LOOPn expands to n semicolon-separated copies of
// LOOP, doubling at every step up to LOOP1024. LOOP itself is defined (and
// later #undef'd) locally by each benchmark function.
#define LOOP2 LOOP;LOOP
#define LOOP4 LOOP2;LOOP2
#define LOOP8 LOOP4;LOOP4
#define LOOP16 LOOP8;LOOP8
#define LOOP32 LOOP16;LOOP16
#define LOOP64 LOOP32;LOOP32
#define LOOP128 LOOP64;LOOP64
#define LOOP256 LOOP128;LOOP128
#define LOOP512 LOOP256;LOOP256
#define LOOP1024 LOOP512;LOOP512
// clang-format on

// Benchmark body that changes SEW/LMUL with the `vsetvli zero, zero` (x0,x0)
// form after an initial `vsetivli`; per the inline notes, vl stays at 4 across
// those changes. The LOOP sequence is expanded 1024 times by LOOP1024.
// The function is declared naked and consists solely of inline asm.
__attribute__((naked)) void vsetvlix0x0() {
#define LOOP                                                                   \
  asm("vsetivli zero, 16, e32, m4, ta, ma"); /*vl=16*/                         \
  asm("vadd.vv v0, v4, v0");                                                   \
  asm("vsetivli zero, 4, e32, m1, ta, ma");   /*vl=4*/                         \
  asm("vsetvli zero, zero, e64, m2, ta, ma"); /*vl=4*/                         \
  asm("vadd.vv v8, v10, v8");                                                  \
  asm("vsetvli zero, zero, e32, m1, ta, ma"); /*vl=4*/                         \
  asm("vadd.vv v12, v13, v12")

  // Emit 1024 back-to-back copies of the sequence defined above.
  LOOP1024;

#undef LOOP

  // Hand-written return: the function body is nothing but inline asm.
  asm("ret");
}

// Benchmark body that re-encodes the AVL as an immediate with `vsetivli` at
// every SEW/LMUL change, instead of using the `vsetvli zero, zero` form.
// Apart from those two instructions the sequence matches vsetvlix0x0(), and it
// is likewise expanded 1024 times by LOOP1024.
// The function is declared naked and consists solely of inline asm.
__attribute__((naked)) void vsetivli() {
#define LOOP                                                                   \
  asm("vsetivli zero, 16, e32, m4, ta, ma"); /*vl=16*/                         \
  asm("vadd.vv v0, v4, v0");                                                   \
  asm("vsetivli zero, 4, e32, m1, ta, ma"); /*vl=4*/                           \
  asm("vsetivli zero, 4, e64, m2, ta, ma"); /*vl=4*/                           \
  asm("vadd.vv v8, v10, v8");                                                  \
  asm("vsetivli zero, 4, e32, m1, ta, ma"); /*vl=4*/                           \
  asm("vadd.vv v12, v13, v12")

  // Emit 1024 back-to-back copies of the sequence defined above.
  LOOP1024;

#undef LOOP

  // Hand-written return: the function body is nothing but inline asm.
  asm("ret");
}

// Calls `func` `times` times in a row and prints the elapsed processor time
// reported by clock(), in raw clock ticks (not seconds), labelled with `name`.
// Nothing is called when `times` is zero or negative.
void bench(const char *name, int times, void (*func)()) {
  clock_t start = clock();
  for (int i = 0; i < times; i++) {
    func();
  }
  clock_t end = clock();
  // clock_t is an implementation-defined arithmetic type, so convert it
  // explicitly to the type that the %ld conversion expects.
  printf("Time of %s: %ld\n", name, (long)(end - start));
}

// Runs both benchmark variants and prints their timings.
// argv[1], when present, is the number of times each benchmark function is
// called; otherwise 1000000 calls are made.
int main(int argc, char **argv) {
  // Only read argv[1] when it exists. argc may be 0 as well as 1, and in that
  // case argv[1] lies past the end of the argument vector.
  int times = argc > 1 ? atoi(argv[1]) : 1000000;
  bench("vsetvlix0x0", times, vsetvlix0x0);
  bench("vsetivli", times, vsetivli);
  return 0;
}
```
Please double-check this; I don't know whether this test is representative.

https://github.com/llvm/llvm-project/pull/169307