[llvm] [RISCV] Optimize VRELOAD/VSPILL lowering if VLEN is known. (PR #74421)

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Mon Dec 4 23:10:54 PST 2023


https://github.com/topperc created https://github.com/llvm/llvm-project/pull/74421

Instead of using VLENB and a shift, load (VLEN/8)*LMUL directly into a register. We could go further and use ADDI, but that would be more intrusive to the code structure.

My primary goal is to remove the read of VLENB, which might be expensive if it's not optimized in hardware.
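For illustration (a sketch based on the m2 cases in the test diffs below; the exact registers depend on allocation): with VLEN known to be 128, VLENB is 16, so for LMUL=2 the spill slot stride is the constant 32. The CSR read plus shift

    csrr a0, vlenb      # a0 = VLENB = VLEN/8
    slli a0, a0, 1      # a0 = VLENB * LMUL, for LMUL = 2

collapses to a single constant materialization:

    li a0, 32           # VLENB * LMUL = 16 * 2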

From 31667a4497f189f4bc20195dc0939d0e03b626b9 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Mon, 4 Dec 2023 22:59:01 -0800
Subject: [PATCH] [RISCV] Optimize VRELOAD/VSPILL lowering if VLEN is known.

Instead of using VLENB and a shift, load (VLEN/8)*LMUL directly into
a register. We could go further and use ADDI, but that would be
more intrusive to the code structure.

My primary goal is to remove the read of VLENB, which might be expensive
if it's not optimized in hardware.
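(Note: if VLENB*LMUL ever exceeds the 12-bit immediate range, movImm
falls back to a short multi-instruction sequence such as LUI/ADDI,
which should still be cheaper than the VLENB read on such cores.)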
---
 llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp   |  40 ++++--
 .../CodeGen/RISCV/rvv/rv32-spill-zvlsseg.ll   | 116 ++++++++++++++++++
 .../CodeGen/RISCV/rvv/rv64-spill-zvlsseg.ll   | 116 ++++++++++++++++++
 3 files changed, 260 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index 475ccc01df1fa..a3c19115bd317 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -299,12 +299,20 @@ void RISCVRegisterInfo::lowerVSPILL(MachineBasicBlock::iterator II) const {
                 "Unexpected subreg numbering");
 
   Register VL = MRI.createVirtualRegister(&RISCV::GPRRegClass);
-  BuildMI(MBB, II, DL, TII->get(RISCV::PseudoReadVLENB), VL);
-  uint32_t ShiftAmount = Log2_32(LMUL);
-  if (ShiftAmount != 0)
-    BuildMI(MBB, II, DL, TII->get(RISCV::SLLI), VL)
-        .addReg(VL)
-        .addImm(ShiftAmount);
+  // Optimize for constant VLEN.
+  const RISCVSubtarget &STI = MF.getSubtarget<RISCVSubtarget>();
+  if (STI.getRealMinVLen() == STI.getRealMaxVLen()) {
+    const int64_t VLENB = STI.getRealMinVLen() / 8;
+    int64_t Offset = VLENB * LMUL;
+    STI.getInstrInfo()->movImm(MBB, II, DL, VL, Offset);
+  } else {
+    BuildMI(MBB, II, DL, TII->get(RISCV::PseudoReadVLENB), VL);
+    uint32_t ShiftAmount = Log2_32(LMUL);
+    if (ShiftAmount != 0)
+      BuildMI(MBB, II, DL, TII->get(RISCV::SLLI), VL)
+          .addReg(VL)
+          .addImm(ShiftAmount);
+  }
 
   Register SrcReg = II->getOperand(0).getReg();
   Register Base = II->getOperand(1).getReg();
@@ -368,12 +376,20 @@ void RISCVRegisterInfo::lowerVRELOAD(MachineBasicBlock::iterator II) const {
                 "Unexpected subreg numbering");
 
   Register VL = MRI.createVirtualRegister(&RISCV::GPRRegClass);
-  BuildMI(MBB, II, DL, TII->get(RISCV::PseudoReadVLENB), VL);
-  uint32_t ShiftAmount = Log2_32(LMUL);
-  if (ShiftAmount != 0)
-    BuildMI(MBB, II, DL, TII->get(RISCV::SLLI), VL)
-        .addReg(VL)
-        .addImm(ShiftAmount);
+  // Optimize for constant VLEN.
+  const RISCVSubtarget &STI = MF.getSubtarget<RISCVSubtarget>();
+  if (STI.getRealMinVLen() == STI.getRealMaxVLen()) {
+    const int64_t VLENB = STI.getRealMinVLen() / 8;
+    int64_t Offset = VLENB * LMUL;
+    STI.getInstrInfo()->movImm(MBB, II, DL, VL, Offset);
+  } else {
+    BuildMI(MBB, II, DL, TII->get(RISCV::PseudoReadVLENB), VL);
+    uint32_t ShiftAmount = Log2_32(LMUL);
+    if (ShiftAmount != 0)
+      BuildMI(MBB, II, DL, TII->get(RISCV::SLLI), VL)
+          .addReg(VL)
+          .addImm(ShiftAmount);
+  }
 
   Register DestReg = II->getOperand(0).getReg();
   Register Base = II->getOperand(1).getReg();
diff --git a/llvm/test/CodeGen/RISCV/rvv/rv32-spill-zvlsseg.ll b/llvm/test/CodeGen/RISCV/rvv/rv32-spill-zvlsseg.ll
index a8a165e549296..407c782d3377a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rv32-spill-zvlsseg.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/rv32-spill-zvlsseg.ll
@@ -3,6 +3,8 @@
 ; RUN:    | FileCheck --check-prefix=SPILL-O0 %s
 ; RUN: llc -mtriple=riscv32 -mattr=+v -mattr=+m -O2 < %s \
 ; RUN:    | FileCheck --check-prefix=SPILL-O2 %s
+; RUN: llc -mtriple=riscv32 -mattr=+v -riscv-v-vector-bits-max=128 -O2 < %s \
+; RUN:    | FileCheck --check-prefix=SPILL-O2-VLEN128 %s
 
 define <vscale x 1 x i32> @spill_zvlsseg_nxv1i32(ptr %base, i32 %vl) nounwind {
 ; SPILL-O0-LABEL: spill_zvlsseg_nxv1i32:
@@ -56,6 +58,28 @@ define <vscale x 1 x i32> @spill_zvlsseg_nxv1i32(ptr %base, i32 %vl) nounwind {
 ; SPILL-O2-NEXT:    add sp, sp, a0
 ; SPILL-O2-NEXT:    addi sp, sp, 16
 ; SPILL-O2-NEXT:    ret
+;
+; SPILL-O2-VLEN128-LABEL: spill_zvlsseg_nxv1i32:
+; SPILL-O2-VLEN128:       # %bb.0: # %entry
+; SPILL-O2-VLEN128-NEXT:    addi sp, sp, -16
+; SPILL-O2-VLEN128-NEXT:    addi sp, sp, -32
+; SPILL-O2-VLEN128-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; SPILL-O2-VLEN128-NEXT:    vlseg2e32.v v8, (a0)
+; SPILL-O2-VLEN128-NEXT:    addi a0, sp, 16
+; SPILL-O2-VLEN128-NEXT:    li a1, 16
+; SPILL-O2-VLEN128-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; SPILL-O2-VLEN128-NEXT:    add a0, a0, a1
+; SPILL-O2-VLEN128-NEXT:    vs1r.v v9, (a0) # Unknown-size Folded Spill
+; SPILL-O2-VLEN128-NEXT:    #APP
+; SPILL-O2-VLEN128-NEXT:    #NO_APP
+; SPILL-O2-VLEN128-NEXT:    addi a0, sp, 16
+; SPILL-O2-VLEN128-NEXT:    li a1, 16
+; SPILL-O2-VLEN128-NEXT:    vl1r.v v7, (a0) # Unknown-size Folded Reload
+; SPILL-O2-VLEN128-NEXT:    add a0, a0, a1
+; SPILL-O2-VLEN128-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
+; SPILL-O2-VLEN128-NEXT:    addi sp, sp, 32
+; SPILL-O2-VLEN128-NEXT:    addi sp, sp, 16
+; SPILL-O2-VLEN128-NEXT:    ret
 entry:
   %0 = tail call {<vscale x 1 x i32>,<vscale x 1 x i32>} @llvm.riscv.vlseg2.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i32> undef, ptr %base, i32 %vl)
   call void asm sideeffect "",
@@ -116,6 +140,28 @@ define <vscale x 2 x i32> @spill_zvlsseg_nxv2i32(ptr %base, i32 %vl) nounwind {
 ; SPILL-O2-NEXT:    add sp, sp, a0
 ; SPILL-O2-NEXT:    addi sp, sp, 16
 ; SPILL-O2-NEXT:    ret
+;
+; SPILL-O2-VLEN128-LABEL: spill_zvlsseg_nxv2i32:
+; SPILL-O2-VLEN128:       # %bb.0: # %entry
+; SPILL-O2-VLEN128-NEXT:    addi sp, sp, -16
+; SPILL-O2-VLEN128-NEXT:    addi sp, sp, -32
+; SPILL-O2-VLEN128-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; SPILL-O2-VLEN128-NEXT:    vlseg2e32.v v8, (a0)
+; SPILL-O2-VLEN128-NEXT:    addi a0, sp, 16
+; SPILL-O2-VLEN128-NEXT:    li a1, 16
+; SPILL-O2-VLEN128-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; SPILL-O2-VLEN128-NEXT:    add a0, a0, a1
+; SPILL-O2-VLEN128-NEXT:    vs1r.v v9, (a0) # Unknown-size Folded Spill
+; SPILL-O2-VLEN128-NEXT:    #APP
+; SPILL-O2-VLEN128-NEXT:    #NO_APP
+; SPILL-O2-VLEN128-NEXT:    addi a0, sp, 16
+; SPILL-O2-VLEN128-NEXT:    li a1, 16
+; SPILL-O2-VLEN128-NEXT:    vl1r.v v7, (a0) # Unknown-size Folded Reload
+; SPILL-O2-VLEN128-NEXT:    add a0, a0, a1
+; SPILL-O2-VLEN128-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
+; SPILL-O2-VLEN128-NEXT:    addi sp, sp, 32
+; SPILL-O2-VLEN128-NEXT:    addi sp, sp, 16
+; SPILL-O2-VLEN128-NEXT:    ret
 entry:
   %0 = tail call {<vscale x 2 x i32>,<vscale x 2 x i32>} @llvm.riscv.vlseg2.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i32> undef, ptr %base, i32 %vl)
   call void asm sideeffect "",
@@ -179,6 +225,28 @@ define <vscale x 4 x i32> @spill_zvlsseg_nxv4i32(ptr %base, i32 %vl) nounwind {
 ; SPILL-O2-NEXT:    add sp, sp, a0
 ; SPILL-O2-NEXT:    addi sp, sp, 16
 ; SPILL-O2-NEXT:    ret
+;
+; SPILL-O2-VLEN128-LABEL: spill_zvlsseg_nxv4i32:
+; SPILL-O2-VLEN128:       # %bb.0: # %entry
+; SPILL-O2-VLEN128-NEXT:    addi sp, sp, -16
+; SPILL-O2-VLEN128-NEXT:    addi sp, sp, -64
+; SPILL-O2-VLEN128-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; SPILL-O2-VLEN128-NEXT:    vlseg2e32.v v8, (a0)
+; SPILL-O2-VLEN128-NEXT:    addi a0, sp, 16
+; SPILL-O2-VLEN128-NEXT:    li a1, 32
+; SPILL-O2-VLEN128-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; SPILL-O2-VLEN128-NEXT:    add a0, a0, a1
+; SPILL-O2-VLEN128-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; SPILL-O2-VLEN128-NEXT:    #APP
+; SPILL-O2-VLEN128-NEXT:    #NO_APP
+; SPILL-O2-VLEN128-NEXT:    addi a0, sp, 16
+; SPILL-O2-VLEN128-NEXT:    li a1, 32
+; SPILL-O2-VLEN128-NEXT:    vl2r.v v6, (a0) # Unknown-size Folded Reload
+; SPILL-O2-VLEN128-NEXT:    add a0, a0, a1
+; SPILL-O2-VLEN128-NEXT:    vl2r.v v8, (a0) # Unknown-size Folded Reload
+; SPILL-O2-VLEN128-NEXT:    addi sp, sp, 64
+; SPILL-O2-VLEN128-NEXT:    addi sp, sp, 16
+; SPILL-O2-VLEN128-NEXT:    ret
 entry:
   %0 = tail call {<vscale x 4 x i32>,<vscale x 4 x i32>} @llvm.riscv.vlseg2.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef, ptr %base, i32 %vl)
   call void asm sideeffect "",
@@ -242,6 +310,28 @@ define <vscale x 8 x i32> @spill_zvlsseg_nxv8i32(ptr %base, i32 %vl) nounwind {
 ; SPILL-O2-NEXT:    add sp, sp, a0
 ; SPILL-O2-NEXT:    addi sp, sp, 16
 ; SPILL-O2-NEXT:    ret
+;
+; SPILL-O2-VLEN128-LABEL: spill_zvlsseg_nxv8i32:
+; SPILL-O2-VLEN128:       # %bb.0: # %entry
+; SPILL-O2-VLEN128-NEXT:    addi sp, sp, -16
+; SPILL-O2-VLEN128-NEXT:    addi sp, sp, -128
+; SPILL-O2-VLEN128-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; SPILL-O2-VLEN128-NEXT:    vlseg2e32.v v8, (a0)
+; SPILL-O2-VLEN128-NEXT:    addi a0, sp, 16
+; SPILL-O2-VLEN128-NEXT:    li a1, 64
+; SPILL-O2-VLEN128-NEXT:    vs4r.v v8, (a0) # Unknown-size Folded Spill
+; SPILL-O2-VLEN128-NEXT:    add a0, a0, a1
+; SPILL-O2-VLEN128-NEXT:    vs4r.v v12, (a0) # Unknown-size Folded Spill
+; SPILL-O2-VLEN128-NEXT:    #APP
+; SPILL-O2-VLEN128-NEXT:    #NO_APP
+; SPILL-O2-VLEN128-NEXT:    addi a0, sp, 16
+; SPILL-O2-VLEN128-NEXT:    li a1, 64
+; SPILL-O2-VLEN128-NEXT:    vl4r.v v4, (a0) # Unknown-size Folded Reload
+; SPILL-O2-VLEN128-NEXT:    add a0, a0, a1
+; SPILL-O2-VLEN128-NEXT:    vl4r.v v8, (a0) # Unknown-size Folded Reload
+; SPILL-O2-VLEN128-NEXT:    addi sp, sp, 128
+; SPILL-O2-VLEN128-NEXT:    addi sp, sp, 16
+; SPILL-O2-VLEN128-NEXT:    ret
 entry:
   %0 = tail call {<vscale x 8 x i32>,<vscale x 8 x i32>} @llvm.riscv.vlseg2.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i32> undef, ptr %base, i32 %vl)
   call void asm sideeffect "",
@@ -314,6 +404,32 @@ define <vscale x 4 x i32> @spill_zvlsseg3_nxv4i32(ptr %base, i32 %vl) nounwind {
 ; SPILL-O2-NEXT:    add sp, sp, a0
 ; SPILL-O2-NEXT:    addi sp, sp, 16
 ; SPILL-O2-NEXT:    ret
+;
+; SPILL-O2-VLEN128-LABEL: spill_zvlsseg3_nxv4i32:
+; SPILL-O2-VLEN128:       # %bb.0: # %entry
+; SPILL-O2-VLEN128-NEXT:    addi sp, sp, -16
+; SPILL-O2-VLEN128-NEXT:    addi sp, sp, -96
+; SPILL-O2-VLEN128-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; SPILL-O2-VLEN128-NEXT:    vlseg3e32.v v8, (a0)
+; SPILL-O2-VLEN128-NEXT:    addi a0, sp, 16
+; SPILL-O2-VLEN128-NEXT:    li a1, 32
+; SPILL-O2-VLEN128-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; SPILL-O2-VLEN128-NEXT:    add a0, a0, a1
+; SPILL-O2-VLEN128-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; SPILL-O2-VLEN128-NEXT:    add a0, a0, a1
+; SPILL-O2-VLEN128-NEXT:    vs2r.v v12, (a0) # Unknown-size Folded Spill
+; SPILL-O2-VLEN128-NEXT:    #APP
+; SPILL-O2-VLEN128-NEXT:    #NO_APP
+; SPILL-O2-VLEN128-NEXT:    addi a0, sp, 16
+; SPILL-O2-VLEN128-NEXT:    li a1, 32
+; SPILL-O2-VLEN128-NEXT:    vl2r.v v6, (a0) # Unknown-size Folded Reload
+; SPILL-O2-VLEN128-NEXT:    add a0, a0, a1
+; SPILL-O2-VLEN128-NEXT:    vl2r.v v8, (a0) # Unknown-size Folded Reload
+; SPILL-O2-VLEN128-NEXT:    add a0, a0, a1
+; SPILL-O2-VLEN128-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; SPILL-O2-VLEN128-NEXT:    addi sp, sp, 96
+; SPILL-O2-VLEN128-NEXT:    addi sp, sp, 16
+; SPILL-O2-VLEN128-NEXT:    ret
 entry:
   %0 = tail call {<vscale x 4 x i32>,<vscale x 4 x i32>,<vscale x 4 x i32>} @llvm.riscv.vlseg3.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef, <vscale x 4 x i32> undef, ptr %base, i32 %vl)
   call void asm sideeffect "",
diff --git a/llvm/test/CodeGen/RISCV/rvv/rv64-spill-zvlsseg.ll b/llvm/test/CodeGen/RISCV/rvv/rv64-spill-zvlsseg.ll
index a3c0ed47e113a..1c1544b4efa0b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rv64-spill-zvlsseg.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/rv64-spill-zvlsseg.ll
@@ -3,6 +3,8 @@
 ; RUN:    | FileCheck --check-prefix=SPILL-O0 %s
 ; RUN: llc -mtriple=riscv64 -mattr=+v -mattr=+m -O2 < %s \
 ; RUN:    | FileCheck --check-prefix=SPILL-O2 %s
+; RUN: llc -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -O2 < %s \
+; RUN:    | FileCheck --check-prefix=SPILL-O2-VLEN128 %s
 
 define <vscale x 1 x i32> @spill_zvlsseg_nxv1i32(ptr %base, i64 %vl) nounwind {
 ; SPILL-O0-LABEL: spill_zvlsseg_nxv1i32:
@@ -56,6 +58,28 @@ define <vscale x 1 x i32> @spill_zvlsseg_nxv1i32(ptr %base, i64 %vl) nounwind {
 ; SPILL-O2-NEXT:    add sp, sp, a0
 ; SPILL-O2-NEXT:    addi sp, sp, 16
 ; SPILL-O2-NEXT:    ret
+;
+; SPILL-O2-VLEN128-LABEL: spill_zvlsseg_nxv1i32:
+; SPILL-O2-VLEN128:       # %bb.0: # %entry
+; SPILL-O2-VLEN128-NEXT:    addi sp, sp, -16
+; SPILL-O2-VLEN128-NEXT:    addi sp, sp, -32
+; SPILL-O2-VLEN128-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; SPILL-O2-VLEN128-NEXT:    vlseg2e32.v v8, (a0)
+; SPILL-O2-VLEN128-NEXT:    addi a0, sp, 16
+; SPILL-O2-VLEN128-NEXT:    li a1, 16
+; SPILL-O2-VLEN128-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; SPILL-O2-VLEN128-NEXT:    add a0, a0, a1
+; SPILL-O2-VLEN128-NEXT:    vs1r.v v9, (a0) # Unknown-size Folded Spill
+; SPILL-O2-VLEN128-NEXT:    #APP
+; SPILL-O2-VLEN128-NEXT:    #NO_APP
+; SPILL-O2-VLEN128-NEXT:    addi a0, sp, 16
+; SPILL-O2-VLEN128-NEXT:    li a1, 16
+; SPILL-O2-VLEN128-NEXT:    vl1r.v v7, (a0) # Unknown-size Folded Reload
+; SPILL-O2-VLEN128-NEXT:    add a0, a0, a1
+; SPILL-O2-VLEN128-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
+; SPILL-O2-VLEN128-NEXT:    addi sp, sp, 32
+; SPILL-O2-VLEN128-NEXT:    addi sp, sp, 16
+; SPILL-O2-VLEN128-NEXT:    ret
 entry:
   %0 = tail call {<vscale x 1 x i32>,<vscale x 1 x i32>} @llvm.riscv.vlseg2.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i32> undef, ptr %base, i64 %vl)
   call void asm sideeffect "",
@@ -116,6 +140,28 @@ define <vscale x 2 x i32> @spill_zvlsseg_nxv2i32(ptr %base, i64 %vl) nounwind {
 ; SPILL-O2-NEXT:    add sp, sp, a0
 ; SPILL-O2-NEXT:    addi sp, sp, 16
 ; SPILL-O2-NEXT:    ret
+;
+; SPILL-O2-VLEN128-LABEL: spill_zvlsseg_nxv2i32:
+; SPILL-O2-VLEN128:       # %bb.0: # %entry
+; SPILL-O2-VLEN128-NEXT:    addi sp, sp, -16
+; SPILL-O2-VLEN128-NEXT:    addi sp, sp, -32
+; SPILL-O2-VLEN128-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; SPILL-O2-VLEN128-NEXT:    vlseg2e32.v v8, (a0)
+; SPILL-O2-VLEN128-NEXT:    addi a0, sp, 16
+; SPILL-O2-VLEN128-NEXT:    li a1, 16
+; SPILL-O2-VLEN128-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; SPILL-O2-VLEN128-NEXT:    add a0, a0, a1
+; SPILL-O2-VLEN128-NEXT:    vs1r.v v9, (a0) # Unknown-size Folded Spill
+; SPILL-O2-VLEN128-NEXT:    #APP
+; SPILL-O2-VLEN128-NEXT:    #NO_APP
+; SPILL-O2-VLEN128-NEXT:    addi a0, sp, 16
+; SPILL-O2-VLEN128-NEXT:    li a1, 16
+; SPILL-O2-VLEN128-NEXT:    vl1r.v v7, (a0) # Unknown-size Folded Reload
+; SPILL-O2-VLEN128-NEXT:    add a0, a0, a1
+; SPILL-O2-VLEN128-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
+; SPILL-O2-VLEN128-NEXT:    addi sp, sp, 32
+; SPILL-O2-VLEN128-NEXT:    addi sp, sp, 16
+; SPILL-O2-VLEN128-NEXT:    ret
 entry:
   %0 = tail call {<vscale x 2 x i32>,<vscale x 2 x i32>} @llvm.riscv.vlseg2.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i32> undef, ptr %base, i64 %vl)
   call void asm sideeffect "",
@@ -179,6 +225,28 @@ define <vscale x 4 x i32> @spill_zvlsseg_nxv4i32(ptr %base, i64 %vl) nounwind {
 ; SPILL-O2-NEXT:    add sp, sp, a0
 ; SPILL-O2-NEXT:    addi sp, sp, 16
 ; SPILL-O2-NEXT:    ret
+;
+; SPILL-O2-VLEN128-LABEL: spill_zvlsseg_nxv4i32:
+; SPILL-O2-VLEN128:       # %bb.0: # %entry
+; SPILL-O2-VLEN128-NEXT:    addi sp, sp, -16
+; SPILL-O2-VLEN128-NEXT:    addi sp, sp, -64
+; SPILL-O2-VLEN128-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; SPILL-O2-VLEN128-NEXT:    vlseg2e32.v v8, (a0)
+; SPILL-O2-VLEN128-NEXT:    addi a0, sp, 16
+; SPILL-O2-VLEN128-NEXT:    li a1, 32
+; SPILL-O2-VLEN128-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; SPILL-O2-VLEN128-NEXT:    add a0, a0, a1
+; SPILL-O2-VLEN128-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; SPILL-O2-VLEN128-NEXT:    #APP
+; SPILL-O2-VLEN128-NEXT:    #NO_APP
+; SPILL-O2-VLEN128-NEXT:    addi a0, sp, 16
+; SPILL-O2-VLEN128-NEXT:    li a1, 32
+; SPILL-O2-VLEN128-NEXT:    vl2r.v v6, (a0) # Unknown-size Folded Reload
+; SPILL-O2-VLEN128-NEXT:    add a0, a0, a1
+; SPILL-O2-VLEN128-NEXT:    vl2r.v v8, (a0) # Unknown-size Folded Reload
+; SPILL-O2-VLEN128-NEXT:    addi sp, sp, 64
+; SPILL-O2-VLEN128-NEXT:    addi sp, sp, 16
+; SPILL-O2-VLEN128-NEXT:    ret
 entry:
   %0 = tail call {<vscale x 4 x i32>,<vscale x 4 x i32>} @llvm.riscv.vlseg2.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef, ptr %base, i64 %vl)
   call void asm sideeffect "",
@@ -242,6 +310,28 @@ define <vscale x 8 x i32> @spill_zvlsseg_nxv8i32(ptr %base, i64 %vl) nounwind {
 ; SPILL-O2-NEXT:    add sp, sp, a0
 ; SPILL-O2-NEXT:    addi sp, sp, 16
 ; SPILL-O2-NEXT:    ret
+;
+; SPILL-O2-VLEN128-LABEL: spill_zvlsseg_nxv8i32:
+; SPILL-O2-VLEN128:       # %bb.0: # %entry
+; SPILL-O2-VLEN128-NEXT:    addi sp, sp, -16
+; SPILL-O2-VLEN128-NEXT:    addi sp, sp, -128
+; SPILL-O2-VLEN128-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; SPILL-O2-VLEN128-NEXT:    vlseg2e32.v v8, (a0)
+; SPILL-O2-VLEN128-NEXT:    addi a0, sp, 16
+; SPILL-O2-VLEN128-NEXT:    li a1, 64
+; SPILL-O2-VLEN128-NEXT:    vs4r.v v8, (a0) # Unknown-size Folded Spill
+; SPILL-O2-VLEN128-NEXT:    add a0, a0, a1
+; SPILL-O2-VLEN128-NEXT:    vs4r.v v12, (a0) # Unknown-size Folded Spill
+; SPILL-O2-VLEN128-NEXT:    #APP
+; SPILL-O2-VLEN128-NEXT:    #NO_APP
+; SPILL-O2-VLEN128-NEXT:    addi a0, sp, 16
+; SPILL-O2-VLEN128-NEXT:    li a1, 64
+; SPILL-O2-VLEN128-NEXT:    vl4r.v v4, (a0) # Unknown-size Folded Reload
+; SPILL-O2-VLEN128-NEXT:    add a0, a0, a1
+; SPILL-O2-VLEN128-NEXT:    vl4r.v v8, (a0) # Unknown-size Folded Reload
+; SPILL-O2-VLEN128-NEXT:    addi sp, sp, 128
+; SPILL-O2-VLEN128-NEXT:    addi sp, sp, 16
+; SPILL-O2-VLEN128-NEXT:    ret
 entry:
   %0 = tail call {<vscale x 8 x i32>,<vscale x 8 x i32>} @llvm.riscv.vlseg2.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i32> undef, ptr %base, i64 %vl)
   call void asm sideeffect "",
@@ -314,6 +404,32 @@ define <vscale x 4 x i32> @spill_zvlsseg3_nxv4i32(ptr %base, i64 %vl) nounwind {
 ; SPILL-O2-NEXT:    add sp, sp, a0
 ; SPILL-O2-NEXT:    addi sp, sp, 16
 ; SPILL-O2-NEXT:    ret
+;
+; SPILL-O2-VLEN128-LABEL: spill_zvlsseg3_nxv4i32:
+; SPILL-O2-VLEN128:       # %bb.0: # %entry
+; SPILL-O2-VLEN128-NEXT:    addi sp, sp, -16
+; SPILL-O2-VLEN128-NEXT:    addi sp, sp, -96
+; SPILL-O2-VLEN128-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; SPILL-O2-VLEN128-NEXT:    vlseg3e32.v v8, (a0)
+; SPILL-O2-VLEN128-NEXT:    addi a0, sp, 16
+; SPILL-O2-VLEN128-NEXT:    li a1, 32
+; SPILL-O2-VLEN128-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; SPILL-O2-VLEN128-NEXT:    add a0, a0, a1
+; SPILL-O2-VLEN128-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; SPILL-O2-VLEN128-NEXT:    add a0, a0, a1
+; SPILL-O2-VLEN128-NEXT:    vs2r.v v12, (a0) # Unknown-size Folded Spill
+; SPILL-O2-VLEN128-NEXT:    #APP
+; SPILL-O2-VLEN128-NEXT:    #NO_APP
+; SPILL-O2-VLEN128-NEXT:    addi a0, sp, 16
+; SPILL-O2-VLEN128-NEXT:    li a1, 32
+; SPILL-O2-VLEN128-NEXT:    vl2r.v v6, (a0) # Unknown-size Folded Reload
+; SPILL-O2-VLEN128-NEXT:    add a0, a0, a1
+; SPILL-O2-VLEN128-NEXT:    vl2r.v v8, (a0) # Unknown-size Folded Reload
+; SPILL-O2-VLEN128-NEXT:    add a0, a0, a1
+; SPILL-O2-VLEN128-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; SPILL-O2-VLEN128-NEXT:    addi sp, sp, 96
+; SPILL-O2-VLEN128-NEXT:    addi sp, sp, 16
+; SPILL-O2-VLEN128-NEXT:    ret
 entry:
   %0 = tail call {<vscale x 4 x i32>,<vscale x 4 x i32>,<vscale x 4 x i32>} @llvm.riscv.vlseg3.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef, <vscale x 4 x i32> undef, ptr %base, i64 %vl)
   call void asm sideeffect "",


