[llvm] [MachineScheduler][RISCV] Release the pending queue based on condition (PR #125468)

via llvm-commits llvm-commits at lists.llvm.org
Mon Feb 3 01:12:09 PST 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-risc-v

Author: Piyou Chen (BeMg)

<details>
<summary>Changes</summary>

During scheduling, the SUnit will be pushed into the pending queue when a hazard occurs. For this reason, those SUnits will not be considered in register pressure measurement, which can cause register spilling in high register pressure regions.

This patch adds a hook to release nodes from the pending queue based on target register pressure information, and includes an option to control this feature. This may help avoid spill/reload operations for in-order cores in high register pressure regions.

```
-misched-release-pending-queue=<true|false>
```


---

Patch is 491.04 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/125468.diff


65 Files Affected:

- (modified) llvm/include/llvm/CodeGen/MachineScheduler.h (+9) 
- (modified) llvm/include/llvm/CodeGen/TargetRegisterInfo.h (+17) 
- (modified) llvm/lib/CodeGen/MachineScheduler.cpp (+45) 
- (modified) llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp (+33) 
- (modified) llvm/lib/Target/RISCV/RISCVRegisterInfo.h (+7) 
- (modified) llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll (+22-22) 
- (modified) llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll (+126-122) 
- (modified) llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll (+22-22) 
- (modified) llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll (+154-148) 
- (modified) llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll (+37-73) 
- (modified) llvm/test/CodeGen/RISCV/rvv/calling-conv.ll (+3-3) 
- (modified) llvm/test/CodeGen/RISCV/rvv/expandload.ll (+232-316) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast-large-vector.ll (+4-4) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll (+116-112) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll (+104-100) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll (+7-7) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll (+7-7) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll (+295-545) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll (+164-208) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll (+344-436) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll (+22-22) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll (+22-22) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll (+371-322) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll (+174-146) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll (+112-112) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-addsub.ll (+7-8) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll (+21-21) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll (+18-18) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll (+8-8) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll (+39-41) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll (+16-26) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll (+40-40) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll (+16-26) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll (+16-26) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll (+40-40) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwadd.ll (+1-1) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll (+1-1) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwsub.ll (+1-1) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll (+5-5) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll (+5-5) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vscale-range.ll (+24-80) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll (+49-50) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll (+2-2) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll (+2-2) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll (+2-2) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll (+2-2) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll (+2-2) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll (+2-2) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll (+2-2) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll (+18-34) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll (+18-34) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll (+103-126) 
- (modified) llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll (+14-14) 
- (modified) llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll (+13-45) 
- (modified) llvm/test/CodeGen/RISCV/rvv/rvv-args-by-mem.ll (+4-4) 
- (modified) llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll (+151-171) 
- (modified) llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll (+36-36) 
- (modified) llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll (+10-10) 
- (modified) llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll (+26-135) 
- (modified) llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll (+26-119) 
- (modified) llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll (+17-18) 
- (modified) llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll (+7-7) 
- (modified) llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll (+3-3) 
- (modified) llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll (+31-31) 
- (modified) llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll (+17-18) 


``````````diff
diff --git a/llvm/include/llvm/CodeGen/MachineScheduler.h b/llvm/include/llvm/CodeGen/MachineScheduler.h
index 660670ccdcd75b4..47809606ff40754 100644
--- a/llvm/include/llvm/CodeGen/MachineScheduler.h
+++ b/llvm/include/llvm/CodeGen/MachineScheduler.h
@@ -1068,6 +1068,13 @@ class SchedBoundary {
   /// Dump the state of the information that tracks resource usage.
   void dumpReservedCycles() const;
   void dumpScheduledState() const;
+
+  void bumpCycleUntilReleaseSUFromPending(SUnit *SU) {
+    while (!Pending.empty() && llvm::find(Pending, SU) != Pending.end()) {
+      bumpCycle(CurrCycle + 1);
+      releasePending();
+    }
+  }
 };
 
 /// Base class for GenericScheduler. This class maintains information about
@@ -1262,6 +1269,8 @@ class GenericScheduler : public GenericSchedulerBase {
     BotCand.SU = nullptr;
   }
 
+  void bumpCycleUntilReleaseSUFromPending(bool IsTop);
+
   void registerRoots() override;
 
 protected:
diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
index 114149ff53d850b..270b9cd8de1df58 100644
--- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -1190,6 +1190,23 @@ class TargetRegisterInfo : public MCRegisterInfo {
     return false;
   }
 
+  /// Based on the target and current register pressure information from the
+  /// Scheduler, determine whether to release the node in the pending queue
+  virtual bool
+  needReleasePendingQueue(MachineFunction &MF,
+                          ArrayRef<unsigned> MaxSetPressure) const {
+    return false;
+  }
+
+  /// For each SUnit, determine whether to release it
+  /// from the pending queue based on the register pressure changes
+  /// associated with that SUnit.
+  virtual bool needReleaseSUFromPendingQueue(MachineFunction &MF,
+                                             ArrayRef<unsigned> PSetID,
+                                             ArrayRef<int> UnitInc) const {
+    return false;
+  }
+
   //===--------------------------------------------------------------------===//
   /// Debug information queries.
 
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 393530f56cc27ee..586c8857bb199fc 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -161,6 +161,10 @@ static cl::opt<bool> EnableCyclicPath("misched-cyclicpath", cl::Hidden,
 static cl::opt<bool> EnableMemOpCluster("misched-cluster", cl::Hidden,
                                         cl::desc("Enable memop clustering."),
                                         cl::init(true));
+static cl::opt<bool>
+    EnableReleasePendingQ("misched-release-pending-queue", cl::Hidden,
+                          cl::desc("Release the pending queue"),
+                          cl::init(true));
 static cl::opt<bool>
     ForceFastCluster("force-fast-cluster", cl::Hidden,
                      cl::desc("Switch to fast cluster algorithm with the lost "
@@ -3656,6 +3660,37 @@ void GenericScheduler::pickNodeFromQueue(SchedBoundary &Zone,
   }
 }
 
+void GenericScheduler::bumpCycleUntilReleaseSUFromPending(bool IsTop) {
+  if (!DAG->isTrackingPressure())
+    return;
+  auto releasePending = [&](ReadyQueue &Q, const RegPressureTracker &RegP,
+                            ArrayRef<unsigned> MaxSetP, SchedBoundary &SchedB) {
+    for (SUnit *SU : Q) {
+      RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RegP);
+      CandPolicy TempPolicy;
+      SchedCandidate TryCand(TempPolicy);
+      initCandidate(TryCand, SU, IsTop, RegP, TempTracker);
+      PressureDiff PDiff = DAG->getPressureDiff(SU);
+      SmallVector<unsigned> PSetIDs;
+      SmallVector<int> UnitIncs;
+      for (const auto &PChange : PDiff) {
+        if (!PChange.isValid())
+          continue;
+        PSetIDs.push_back(PChange.getPSet());
+        UnitIncs.push_back(PChange.getUnitInc());
+      }
+      if (TRI->needReleaseSUFromPendingQueue(DAG->MF, PSetIDs, UnitIncs))
+        SchedB.bumpCycleUntilReleaseSUFromPending(SU);
+    }
+  };
+  if (IsTop)
+    releasePending(Top.Pending, DAG->getTopRPTracker(),
+                   DAG->getTopRPTracker().getPressure().MaxSetPressure, Top);
+  else
+    releasePending(Bot.Pending, DAG->getBotRPTracker(),
+                   DAG->getBotRPTracker().getPressure().MaxSetPressure, Bot);
+}
+
 /// Pick the best candidate node from either the top or bottom queue.
 SUnit *GenericScheduler::pickNodeBidirectional(bool &IsTopNode) {
   // Schedule as far as possible in the direction of no choice. This is most
@@ -3741,6 +3776,16 @@ SUnit *GenericScheduler::pickNode(bool &IsTopNode) {
            Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
     return nullptr;
   }
+
+  if (EnableReleasePendingQ && !RegionPolicy.OnlyBottomUp &&
+      TRI->needReleasePendingQueue(
+          DAG->MF, DAG->getTopRPTracker().getPressure().MaxSetPressure))
+    bumpCycleUntilReleaseSUFromPending(/*IsTop=*/true);
+  if (EnableReleasePendingQ && !RegionPolicy.OnlyTopDown &&
+      TRI->needReleasePendingQueue(
+          DAG->MF, DAG->getBotRPTracker().getPressure().MaxSetPressure))
+    bumpCycleUntilReleaseSUFromPending(/*IsTop=*/false);
+
   SUnit *SU;
   do {
     if (RegionPolicy.OnlyTopDown) {
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index b0a52698c1e9f10..91605c5acda0cb2 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -954,3 +954,36 @@ bool RISCVRegisterInfo::getRegAllocationHints(
 
   return BaseImplRetVal;
 }
+
+bool RISCVRegisterInfo::needReleasePendingQueue(
+    MachineFunction &MF, ArrayRef<unsigned> MaxSetPressure) const {
+  for (unsigned Idx = 0; Idx < MaxSetPressure.size(); Idx++) {
+    // Consider only the RVV Register, as RVV spilling/reloading has higher
+    // potential costs than hazards.
+    if (!StringRef(getRegPressureSetName(Idx)).starts_with("VM") &&
+        !StringRef(getRegPressureSetName(Idx)).starts_with("VRM8NoV0"))
+      continue;
+    const unsigned RVVRegPressureThreshold = 7;
+    if (MaxSetPressure[Idx] + RVVRegPressureThreshold >
+        getRegPressureSetLimit(MF, Idx)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool RISCVRegisterInfo::needReleaseSUFromPendingQueue(
+    MachineFunction &MF, ArrayRef<unsigned> PSetID,
+    ArrayRef<int> UnitInc) const {
+  const int UnitIncRVVRegPressureThreshold = -3;
+  for (unsigned Idx = 0; Idx < PSetID.size(); Idx++) {
+    // Consider only the RVV Register, as RVV spilling/reloading has higher
+    // potential costs than hazards.
+    if (!StringRef(getRegPressureSetName(PSetID[Idx])).starts_with("VM") &&
+        !StringRef(getRegPressureSetName(Idx)).starts_with("VRM8NoV0"))
+      continue;
+    if (UnitInc[Idx] < UnitIncRVVRegPressureThreshold)
+      return true;
+  }
+  return false;
+}
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
index 3ab79694e175c8a..faf81b2d8b73d65 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
@@ -144,6 +144,13 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo {
   static bool isRVVRegClass(const TargetRegisterClass *RC) {
     return RISCVRI::isVRegClass(RC->TSFlags);
   }
+  bool
+  needReleasePendingQueue(MachineFunction &MF,
+                          ArrayRef<unsigned> MaxSetPressure) const override;
+
+  bool needReleaseSUFromPendingQueue(MachineFunction &MF,
+                                     ArrayRef<unsigned> PSetID,
+                                     ArrayRef<int> UnitInc) const override;
 };
 } // namespace llvm
 
diff --git a/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll
index 1ed84316d4484cd..cd795f722676bd9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll
@@ -1137,38 +1137,38 @@ define <vscale x 8 x i64> @bitreverse_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    li a2, 40
 ; RV32-NEXT:    lui a3, 16
-; RV32-NEXT:    lui a4, 4080
-; RV32-NEXT:    addi a5, sp, 8
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32-NEXT:    vsetvli a4, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vx v16, v8, a1
 ; RV32-NEXT:    vsrl.vx v24, v8, a2
-; RV32-NEXT:    addi a0, a3, -256
+; RV32-NEXT:    addi a3, a3, -256
 ; RV32-NEXT:    vsll.vx v0, v8, a1
-; RV32-NEXT:    vand.vx v24, v24, a0
+; RV32-NEXT:    vand.vx v24, v24, a3
 ; RV32-NEXT:    vor.vv v16, v24, v16
 ; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vx v16, v8, a0
+; RV32-NEXT:    vand.vx v16, v8, a3
 ; RV32-NEXT:    vsll.vx v16, v16, a2
 ; RV32-NEXT:    vor.vv v16, v0, v16
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vlse64.v v0, (a5), zero
-; RV32-NEXT:    vsrl.vi v16, v8, 24
-; RV32-NEXT:    vand.vx v16, v16, a4
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v0, v8, 24
+; RV32-NEXT:    lui a1, 4080
+; RV32-NEXT:    addi a2, sp, 8
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    sw zero, 12(sp)
+; RV32-NEXT:    vand.vx v0, v0, a1
 ; RV32-NEXT:    vsrl.vi v24, v8, 8
-; RV32-NEXT:    vand.vv v24, v24, v0
-; RV32-NEXT:    vor.vv v16, v24, v16
+; RV32-NEXT:    vlse64.v v16, (a2), zero
+; RV32-NEXT:    vand.vv v24, v24, v16
+; RV32-NEXT:    vor.vv v24, v24, v0
 ; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v24, v16, v24
-; RV32-NEXT:    vand.vv v16, v8, v0
-; RV32-NEXT:    vand.vx v8, v8, a4
+; RV32-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v24, v24, v0
+; RV32-NEXT:    vand.vv v16, v8, v16
+; RV32-NEXT:    vand.vx v8, v8, a1
 ; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vsll.vi v16, v16, 8
 ; RV32-NEXT:    vor.vv v8, v8, v16
diff --git a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
index 4d34621cd5f243c..8ae560c07e21016 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
@@ -2288,66 +2288,68 @@ define <vscale x 7 x i64> @vp_bitreverse_nxv7i64(<vscale x 7 x i64> %va, <vscale
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vmv8r.v v24, v8
 ; RV32-NEXT:    lui a1, 1044480
 ; RV32-NEXT:    li a2, 56
 ; RV32-NEXT:    lui a3, 16
 ; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    addi a5, sp, 8
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vx v16, v8, a2, v0.t
 ; RV32-NEXT:    addi a1, a3, -256
-; RV32-NEXT:    vand.vx v24, v8, a1, v0.t
-; RV32-NEXT:    vsll.vx v24, v24, a4, v0.t
-; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV32-NEXT:    vsll.vx v8, v8, a4, v0.t
+; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vlse64.v v16, (a5), zero
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vx v16, v24, a5, v0.t
+; RV32-NEXT:    vsll.vi v8, v16, 24, v0.t
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vlse64.v v8, (a6), zero
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v8, v16, v0.t
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v8, v16, v0.t
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a3, 4080
-; RV32-NEXT:    vand.vx v24, v8, a3, v0.t
-; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
-; RV32-NEXT:    addi a5, sp, 16
-; RV32-NEXT:    vs8r.v v24, (a5) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsll.vi v16, v24, 8, v0.t
-; RV32-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV32-NEXT:    csrr a5, vlenb
-; RV32-NEXT:    slli a5, a5, 4
-; RV32-NEXT:    add a5, sp, a5
-; RV32-NEXT:    addi a5, a5, 16
-; RV32-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV32-NEXT:    csrr a5, vlenb
-; RV32-NEXT:    slli a5, a5, 4
-; RV32-NEXT:    add a5, sp, a5
-; RV32-NEXT:    addi a5, a5, 16
-; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
-; RV32-NEXT:    vsrl.vx v24, v8, a4, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vsrl.vx v16, v24, a2, v0.t
+; RV32-NEXT:    vsrl.vx v8, v24, a4, v0.t
+; RV32-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a3, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v24, 24, v0.t
+; RV32-NEXT:    vand.vx v16, v8, a5, v0.t
+; RV32-NEXT:    vsrl.vi v8, v24, 8, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -2497,40 +2499,40 @@ define <vscale x 7 x i64> @vp_bitreverse_nxv7i64_unmasked(<vscale x 7 x i64> %va
 ; RV32-NEXT:    lui a3, 16
 ; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    lui a5, 4080
-; RV32-NEXT:    addi a6, sp, 8
-; RV32-NEXT:    sw a1, 8(sp)
-; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vx v16, v8, a2
-; RV32-NEXT:    addi a1, a3, -256
 ; RV32-NEXT:    vsrl.vx v24, v8, a2
+; RV32-NEXT:    addi a2, sp, 8
+; RV32-NEXT:    addi a3, a3, -256
 ; RV32-NEXT:    vsrl.vx v0, v8, a4
-; RV32-NEXT:    vand.vx v0, v0, a1
+; RV32-NEXT:    vand.vx v0, v0, a3
 ; RV32-NEXT:    vor.vv v24, v0, v24
-; RV32-NEXT:    addi a2, sp, 16
-; RV32-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vx v24, v8, a1
+; RV32-NEXT:    addi a6, sp, 16
+; RV32-NEXT:    vs8r.v v24, (a6) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vx v24, v8, a3
 ; RV32-NEXT:    vsll.vx v24, v24, a4
 ; RV32-NEXT:    vor.vv v16, v16, v24
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vlse64.v v24, (a6), zero
-; RV32-NEXT:    vsrl.vi v16, v8, 24
-; RV32-NEXT:    vand.vx v16, v16, a5
-; RV32-NEXT:    vsrl.vi v0, v8, 8
-; RV32-NEXT:    vand.vv v0, v0, v24
-; RV32-NEXT:    vor.vv v16, v0, v16
-; RV32-NEXT:    vand.vv v24, v8, v24
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v24, v8, 24
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    sw zero, 12(sp)
+; RV32-NEXT:    vand.vx v0, v24, a5
+; RV32-NEXT:    vsrl.vi v24, v8, 8
+; RV32-NEXT:    vlse64.v v16, (a2), zero
+; RV32-NEXT:    vand.vv v24, v24, v16
+; RV32-NEXT:    vor.vv v0, v24, v0
+; RV32-NEXT:    vand.vv v16, v8, v16
 ; RV32-NEXT:    vand.vx v8, v8, a5
 ; RV32-NEXT:    vsll.vi v8, v8, 24
-; RV32-NEXT:    vsll.vi v24, v24, 8
-; RV32-NEXT:    vor.vv v24, v8, v24
+; RV32-NEXT:    vsll.vi v16, v16, 8
+; RV32-NEXT:    vor.vv v24, v8, v16
 ; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vor.vv v8, v0, v8
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    lui a2, 209715
 ; RV32-NEXT:    lui a3, 349525
@@ -2673,66 +2675,68 @@ define <vscale x 8 x i64> @vp_bitreverse_nxv8i64(<vscale x 8 x i64> %va, <vscale
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vmv8r.v v24, v8
 ; RV32-NEXT:    lui a1, 1044480
 ; RV32-NEXT:    li a2, 56
 ; RV32-NEXT:    lui a3, 16
 ; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    addi a5, sp, 8
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vx v16, v8, a2, v0.t
 ; RV32-NEXT:    addi a1, a3, -256
-; RV32-NEXT:    vand.vx v24, v8, a1, v0.t
-; RV32-NEXT:    vsll.vx v24, v24, a4, v0.t
-; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV32-NEXT:    vsll.vx v8, v8, a4, v0.t
+; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vlse64.v v16, (a5), zero
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vx v16, v24, a5, v0.t
+; RV32-NEXT:    vsll.vi v8, v16, 24, v0.t
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vlse64.v v8, (a6), zero
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v8, v16, v0.t
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v8, v16, v0.t
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a3, 4080
-; RV32-NEXT:    vand.vx v24, v8, a3, v0.t
-; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
-; RV32-NEXT:    addi a5, sp, 16
-; RV32-NEXT:    vs8r.v v24, (a5) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsll.vi v16, v24, 8, v0.t
-; RV32-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16,...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/125468


More information about the llvm-commits mailing list