[llvm] [RISCV] Reduce minimum VL needed for vslidedown.vx in RISCVVLOptimizer (PR #168392)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 17 07:59:50 PST 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-risc-v
Author: Luke Lau (lukel97)
<details>
<summary>Changes</summary>
Once #149042 is relanded, we will start EVL tail folding vectorized loops that have live-outs, e.g.:
```c
int f(int *x, int n) {
  int y = 0;
  for (int i = 0; i < n; i++) {
    y = x[i] + 1;
    x[i] = y;
  }
  return y;
}
```
These are vectorized by extracting the last "active lane" in the loop's exit:
```llvm
loop:
  %vl = call i32 @llvm.experimental.get.vector.length(i64 %avl, i32 4, i1 true)
  ...
exit:
  %vl.zext = zext i32 %vl to i64
  %lastidx = sub i64 %vl.zext, 1
  %lastelt = extractelement <vscale x 4 x i32> %y, i64 %lastidx
```
On RISC-V this translates to a vslidedown.vx with a VL of 1:
```llvm
bb.loop:
  %vl:gprnox0 = PseudoVSETVLI ...
  %y:vr = PseudoVADD_VI_M1 $noreg, %x, 1, AVL=-1
  ...
bb.exit:
  %lastidx:gprnox0 = ADDI %vl, -1
  %w:vr = PseudoVSLIDEDOWN_VX_M1 $noreg, %y, %lastidx, AVL=1
```
However, today we fail to reduce the VL of %y in the loop and end up with two extra VL toggles. The reason is that RISCVVLOptimizer is conservative with vslidedown.vx, since it can read lanes of %y past its own VL, so in `getMinimumVLForUser` we say that vslidedown.vx demands the entirety of %y.
One observation about the sequence above is that it only needs to read the first %vl lanes of %y: with an AVL of 1, the only vs2 lane the slide reads is the one at index offset, so it demands just the first offset + 1 lanes. In this case that's `%lastidx + 1 = %vl - 1 + 1 = %vl`.
This PR teaches RISCVVLOptimizer about this case in `getMinimumVLForVSLIDEDOWN_VX`, and in doing so removes the VL toggles.
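For illustration, here is a minimal sketch of the same sequence once the VL of %y has been reduced. This is not literal compiler output: the register names are reused from the example above, and the final vsetvli placement is left to RISCVInsertVSETVLI.
```llvm
bb.loop:
  %vl:gprnox0 = PseudoVSETVLI ...
  ; %y's AVL is now %vl instead of VLMAX, so no extra VL toggle is needed
  %y:vr = PseudoVADD_VI_M1 $noreg, %x, 1, AVL=%vl
  ...
bb.exit:
  %lastidx:gprnox0 = ADDI %vl, -1
  ; unchanged: only lane %lastidx of %y is slid down into element 0
  %w:vr = PseudoVSLIDEDOWN_VX_M1 $noreg, %y, %lastidx, AVL=1
```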
The one case I had to think about for a bit is when `ADDI %vl, -1` wraps, i.e. when the resulting offset is all ones. That offset is always larger than the largest possible VLMAX, so vs2 is completely slid down and absent from the output, and we don't need to read anything from it.
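As a concrete (purely hypothetical) instance of the wrap-around case, suppose XLEN is 64 and %vl happens to be 0:
```llvm
; assumes %vl = 0
%lastidx:gprnox0 = ADDI %vl, -1   ; 0 - 1 wraps to 0xFFFFFFFFFFFFFFFF
; that offset exceeds 65536, the largest possible VLMAX, so every lane of
; vs2 is slid past the end of the result: the vslidedown.vx reads no lanes
; of %y, and any VL we demand from %y is safe
%w:vr = PseudoVSLIDEDOWN_VX_M1 $noreg, %y, %lastidx, AVL=1
```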
This patch on its own has no observable effect on llvm-test-suite or SPEC CPU 2017 with rva23u64 today.
---
Full diff: https://github.com/llvm/llvm-project/pull/168392.diff
3 Files Affected:
- (modified) llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp (+40-1)
- (added) llvm/test/CodeGen/RISCV/rvv/vl-opt-live-out.ll (+44)
- (modified) llvm/test/CodeGen/RISCV/rvv/vl-opt.mir (+18)
``````````diff
diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
index 0a8838cbd45c7..5011b178a5770 100644
--- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
@@ -62,7 +62,7 @@ struct DemandedVL {
};
class RISCVVLOptimizer : public MachineFunctionPass {
- const MachineRegisterInfo *MRI;
+ MachineRegisterInfo *MRI;
const MachineDominatorTree *MDT;
const TargetInstrInfo *TII;
@@ -1392,6 +1392,41 @@ bool RISCVVLOptimizer::isCandidate(const MachineInstr &MI) const {
return true;
}
+/// Given a vslidedown.vx like:
+///
+/// %slideamt = ADDI %x, -1
+/// %v = PseudoVSLIDEDOWN_VX %passthru, %src, %slideamt, avl=1
+///
+/// %v will only read the first %slideamt + 1 lanes of %src, which = %x.
+/// This is a common case when lowering extractelement.
+///
+/// Note that if %x is 0, %slideamt will be all ones. In this case %src will be
+/// completely slid down and none of its lanes will be read (since %slideamt is
+/// greater than the largest VLMAX of 65536) so we can demand any minimum VL.
+static std::optional<DemandedVL>
+getMinimumVLForVSLIDEDOWN_VX(const MachineOperand &UserOp,
+ const MachineRegisterInfo *MRI) {
+ const MachineInstr &MI = *UserOp.getParent();
+ if (RISCV::getRVVMCOpcode(MI.getOpcode()) != RISCV::VSLIDEDOWN_VX)
+ return std::nullopt;
+ // We're looking at what lanes are used from the src operand.
+ if (UserOp.getOperandNo() != 2)
+ return std::nullopt;
+ // For now, the AVL must be 1.
+ const MachineOperand &AVL = MI.getOperand(4);
+ if (!AVL.isImm() || AVL.getImm() != 1)
+ return std::nullopt;
+ // The slide amount must be %x - 1.
+ const MachineOperand &SlideAmt = MI.getOperand(3);
+ if (!SlideAmt.getReg().isVirtual())
+ return std::nullopt;
+ MachineInstr *SlideAmtDef = MRI->getUniqueVRegDef(SlideAmt.getReg());
+ if (SlideAmtDef->getOpcode() != RISCV::ADDI ||
+ SlideAmtDef->getOperand(2).getImm() != -AVL.getImm())
+ return std::nullopt;
+ return SlideAmtDef->getOperand(1);
+}
+
DemandedVL
RISCVVLOptimizer::getMinimumVLForUser(const MachineOperand &UserOp) const {
const MachineInstr &UserMI = *UserOp.getParent();
@@ -1406,6 +1441,9 @@ RISCVVLOptimizer::getMinimumVLForUser(const MachineOperand &UserOp) const {
return DemandedVL::vlmax();
}
+ if (auto VL = getMinimumVLForVSLIDEDOWN_VX(UserOp, MRI))
+ return *VL;
+
if (RISCVII::readsPastVL(
TII->get(RISCV::getRVVMCOpcode(UserMI.getOpcode())).TSFlags)) {
LLVM_DEBUG(dbgs() << " Abort because used by unsafe instruction\n");
@@ -1624,6 +1662,7 @@ bool RISCVVLOptimizer::tryReduceVL(MachineInstr &MI) const {
// All our checks passed. We can reduce VL.
VLOp.ChangeToRegister(CommonVL->getReg(), false);
+ MRI->constrainRegClass(CommonVL->getReg(), &RISCV::GPRNoX0RegClass);
return true;
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt-live-out.ll b/llvm/test/CodeGen/RISCV/rvv/vl-opt-live-out.ll
new file mode 100644
index 0000000000000..cf15fad5533b9
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt-live-out.ll
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s
+
+define i32 @loop_live_out(ptr %p, i64 %n) {
+; CHECK-LABEL: loop_live_out:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: mv a2, a0
+; CHECK-NEXT: .LBB0_1: # %loop
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vsetvli a3, a1, e32, m2, ta, ma
+; CHECK-NEXT: vle32.v v8, (a2)
+; CHECK-NEXT: sub a1, a1, a3
+; CHECK-NEXT: vadd.vi v8, v8, 1
+; CHECK-NEXT: vse32.v v8, (a2)
+; CHECK-NEXT: slli a2, a3, 2
+; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: bnez a1, .LBB0_1
+; CHECK-NEXT: # %bb.2: # %exit
+; CHECK-NEXT: addi a3, a3, -1
+; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; CHECK-NEXT: vslidedown.vx v8, v8, a3
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: ret
+entry:
+ br label %loop
+
+loop:
+ %avl = phi i64 [%n, %entry], [%avl.next, %loop]
+ %gep = phi ptr [%p, %entry], [%gep.next, %loop]
+ %vl = call i32 @llvm.experimental.get.vector.length(i64 %avl, i32 4, i1 true)
+ %x = call <vscale x 4 x i32> @llvm.vp.load(ptr %gep, <vscale x 4 x i1> splat (i1 true), i32 %vl)
+ %y = add <vscale x 4 x i32> %x, splat (i32 1)
+ call void @llvm.vp.store(<vscale x 4 x i32> %y, ptr %gep, <vscale x 4 x i1> splat (i1 true), i32 %vl)
+ %vl.zext = zext i32 %vl to i64
+ %avl.next = sub i64 %avl, %vl.zext
+ %gep.next = getelementptr i32, ptr %p, i32 %vl
+ %ec = icmp eq i64 %avl.next, 0
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ %lastidx = sub i64 %vl.zext, 1
+ %lastelt = extractelement <vscale x 4 x i32> %y, i64 %lastidx
+ ret i32 %lastelt
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir b/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir
index 4d6d0e122b1cf..ddd23f3d575d8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir
@@ -778,3 +778,21 @@ body: |
; CHECK: DBG_VALUE %0:vr
DBG_VALUE %0:vr
...
+---
+name: vslidedown_vx
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x8
+ ; CHECK-LABEL: name: vslidedown_vx
+ ; CHECK: liveins: $x8
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %x:gprnox0 = COPY $x8
+ ; CHECK-NEXT: %y:gprnox0 = ADDI %x, -1
+ ; CHECK-NEXT: %v:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, %x, 5 /* e32 */, 0 /* tu, mu */
+ ; CHECK-NEXT: %w:vr = PseudoVSLIDEDOWN_VX_M1 $noreg, %v, %y, 1, 5 /* e32 */, 0 /* tu, mu */
+ %x:gpr = COPY $x8
+ %y:gprnox0 = ADDI %x, -1
+ %v:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 5 /* e32 */, 0 /* tu, mu */
+ %w:vr = PseudoVSLIDEDOWN_VX_M1 $noreg, %v, %y, 1, 5 /* e32 */, 0 /* tu, mu */
+...
``````````
</details>
https://github.com/llvm/llvm-project/pull/168392
More information about the llvm-commits mailing list