[llvm] [MachineScheduler][RISCV] Release the pending queue based on condition (PR #125468)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 3 01:12:09 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-risc-v
Author: Piyou Chen (BeMg)
<details>
<summary>Changes</summary>
During scheduling, the SUnit will be pushed into the pending queue when a hazard occurs. For this reason, those SUnits will not be considered in register pressure measurement, which can cause register spilling in high register pressure regions.
This patch adds a hook to release nodes from the pending queue based on target register pressure information, and includes an option to control this feature. This may help avoid spill/reload operations for in-order cores in high register pressure regions.
```
-misched-release-pending-queue=<true|false>
```
---
Patch is 491.04 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/125468.diff
65 Files Affected:
- (modified) llvm/include/llvm/CodeGen/MachineScheduler.h (+9)
- (modified) llvm/include/llvm/CodeGen/TargetRegisterInfo.h (+17)
- (modified) llvm/lib/CodeGen/MachineScheduler.cpp (+45)
- (modified) llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp (+33)
- (modified) llvm/lib/Target/RISCV/RISCVRegisterInfo.h (+7)
- (modified) llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll (+22-22)
- (modified) llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll (+126-122)
- (modified) llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll (+22-22)
- (modified) llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll (+154-148)
- (modified) llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll (+37-73)
- (modified) llvm/test/CodeGen/RISCV/rvv/calling-conv.ll (+3-3)
- (modified) llvm/test/CodeGen/RISCV/rvv/expandload.ll (+232-316)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast-large-vector.ll (+4-4)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll (+116-112)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll (+104-100)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll (+7-7)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll (+7-7)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll (+295-545)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll (+164-208)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll (+344-436)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll (+22-22)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll (+22-22)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll (+371-322)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll (+174-146)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll (+112-112)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-addsub.ll (+7-8)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll (+21-21)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll (+18-18)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll (+8-8)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll (+39-41)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll (+16-26)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll (+40-40)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll (+16-26)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll (+16-26)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll (+40-40)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwadd.ll (+1-1)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll (+1-1)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwsub.ll (+1-1)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll (+5-5)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll (+5-5)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vscale-range.ll (+24-80)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll (+49-50)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll (+2-2)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll (+2-2)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll (+2-2)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll (+2-2)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll (+2-2)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll (+2-2)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll (+2-2)
- (modified) llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll (+18-34)
- (modified) llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll (+18-34)
- (modified) llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll (+103-126)
- (modified) llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll (+14-14)
- (modified) llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll (+13-45)
- (modified) llvm/test/CodeGen/RISCV/rvv/rvv-args-by-mem.ll (+4-4)
- (modified) llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll (+151-171)
- (modified) llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll (+36-36)
- (modified) llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll (+10-10)
- (modified) llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll (+26-135)
- (modified) llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll (+26-119)
- (modified) llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll (+17-18)
- (modified) llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll (+7-7)
- (modified) llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll (+3-3)
- (modified) llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll (+31-31)
- (modified) llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll (+17-18)
``````````diff
diff --git a/llvm/include/llvm/CodeGen/MachineScheduler.h b/llvm/include/llvm/CodeGen/MachineScheduler.h
index 660670ccdcd75b4..47809606ff40754 100644
--- a/llvm/include/llvm/CodeGen/MachineScheduler.h
+++ b/llvm/include/llvm/CodeGen/MachineScheduler.h
@@ -1068,6 +1068,13 @@ class SchedBoundary {
/// Dump the state of the information that tracks resource usage.
void dumpReservedCycles() const;
void dumpScheduledState() const;
+
+  /// Repeatedly advance the current cycle (releasing newly-ready nodes each
+  /// time) until \p SU is no longer waiting in the Pending queue.
+  /// NOTE(review): assumes every pending node is eventually released as
+  /// CurrCycle advances; otherwise this loop would not terminate -- confirm.
+  void bumpCycleUntilReleaseSUFromPending(SUnit *SU) {
+    // is_contained on an empty queue is false, so no separate empty() check
+    // is needed.
+    while (llvm::is_contained(Pending, SU)) {
+      bumpCycle(CurrCycle + 1);
+      releasePending();
+    }
+  }
};
/// Base class for GenericScheduler. This class maintains information about
@@ -1262,6 +1269,8 @@ class GenericScheduler : public GenericSchedulerBase {
BotCand.SU = nullptr;
}
+ void bumpCycleUntilReleaseSUFromPending(bool IsTop);
+
void registerRoots() override;
protected:
diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
index 114149ff53d850b..270b9cd8de1df58 100644
--- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -1190,6 +1190,23 @@ class TargetRegisterInfo : public MCRegisterInfo {
return false;
}
+  /// Hook for the machine scheduler: given the current maximum register
+  /// pressure of every pressure set (\p MaxSetPressure is indexed by
+  /// pressure-set ID), decide whether nodes should be released from the
+  /// scheduler's pending queue. The base implementation never requests a
+  /// release, preserving the default scheduling behavior.
+  virtual bool
+  needReleasePendingQueue(MachineFunction &MF,
+                          ArrayRef<unsigned> MaxSetPressure) const {
+    return false;
+  }
+
+  /// Hook for the machine scheduler: decide whether a single SUnit should be
+  /// released from the pending queue, given its register-pressure changes as
+  /// parallel arrays: \p PSetID[i] is a pressure-set ID and \p UnitInc[i] is
+  /// the corresponding pressure change (in register units) for that set.
+  /// The base implementation never requests a release.
+  virtual bool needReleaseSUFromPendingQueue(MachineFunction &MF,
+                                             ArrayRef<unsigned> PSetID,
+                                             ArrayRef<int> UnitInc) const {
+    return false;
+  }
+
//===--------------------------------------------------------------------===//
/// Debug information queries.
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 393530f56cc27ee..586c8857bb199fc 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -161,6 +161,10 @@ static cl::opt<bool> EnableCyclicPath("misched-cyclicpath", cl::Hidden,
static cl::opt<bool> EnableMemOpCluster("misched-cluster", cl::Hidden,
cl::desc("Enable memop clustering."),
cl::init(true));
+static cl::opt<bool>
+ EnableReleasePendingQ("misched-release-pending-queue", cl::Hidden,
+ cl::desc("Release the pending queue"),
+ cl::init(true));
static cl::opt<bool>
ForceFastCluster("force-fast-cluster", cl::Hidden,
cl::desc("Switch to fast cluster algorithm with the lost "
@@ -3656,6 +3660,37 @@ void GenericScheduler::pickNodeFromQueue(SchedBoundary &Zone,
}
}
+/// For each SUnit parked in the pending queue of the chosen boundary, ask the
+/// target whether its register-pressure profile justifies forcing it out of
+/// Pending (by bumping the current cycle until the boundary releases it).
+void GenericScheduler::bumpCycleUntilReleaseSUFromPending(bool IsTop) {
+  if (!DAG->isTrackingPressure())
+    return;
+  auto ReleasePending = [&](SchedBoundary &SchedB) {
+    // Releasing a node mutates SchedB.Pending, so iterate over a snapshot
+    // instead of the live queue to avoid iterator invalidation.
+    SmallVector<SUnit *, 8> PendingSUs(SchedB.Pending.begin(),
+                                       SchedB.Pending.end());
+    for (SUnit *SU : PendingSUs) {
+      // Collect this SUnit's valid pressure-set changes as parallel
+      // (PSetID, UnitInc) arrays for the target hook.
+      SmallVector<unsigned> PSetIDs;
+      SmallVector<int> UnitIncs;
+      for (const PressureChange &PChange : DAG->getPressureDiff(SU)) {
+        if (!PChange.isValid())
+          continue;
+        PSetIDs.push_back(PChange.getPSet());
+        UnitIncs.push_back(PChange.getUnitInc());
+      }
+      if (TRI->needReleaseSUFromPendingQueue(DAG->MF, PSetIDs, UnitIncs))
+        SchedB.bumpCycleUntilReleaseSUFromPending(SU);
+    }
+  };
+  // NOTE(review): the original draft also built a SchedCandidate via
+  // initCandidate() here, but its result was never used; dropped as dead
+  // code -- confirm no required side effect.
+  ReleasePending(IsTop ? Top : Bot);
+}
+
/// Pick the best candidate node from either the top or bottom queue.
SUnit *GenericScheduler::pickNodeBidirectional(bool &IsTopNode) {
// Schedule as far as possible in the direction of no choice. This is most
@@ -3741,6 +3776,16 @@ SUnit *GenericScheduler::pickNode(bool &IsTopNode) {
Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
return nullptr;
}
+
+ if (EnableReleasePendingQ && !RegionPolicy.OnlyBottomUp &&
+ TRI->needReleasePendingQueue(
+ DAG->MF, DAG->getTopRPTracker().getPressure().MaxSetPressure))
+ bumpCycleUntilReleaseSUFromPending(/*IsTop=*/true);
+ if (EnableReleasePendingQ && !RegionPolicy.OnlyTopDown &&
+ TRI->needReleasePendingQueue(
+ DAG->MF, DAG->getBotRPTracker().getPressure().MaxSetPressure))
+ bumpCycleUntilReleaseSUFromPending(/*IsTop=*/false);
+
SUnit *SU;
do {
if (RegionPolicy.OnlyTopDown) {
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index b0a52698c1e9f10..91605c5acda0cb2 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -954,3 +954,36 @@ bool RISCVRegisterInfo::getRegAllocationHints(
return BaseImplRetVal;
}
+
+bool RISCVRegisterInfo::needReleasePendingQueue(
+    MachineFunction &MF, ArrayRef<unsigned> MaxSetPressure) const {
+  // Headroom (in register units) below the set limit at which we consider
+  // the region to be under high RVV pressure. Loop-invariant, so hoisted.
+  constexpr unsigned RVVRegPressureThreshold = 7;
+  for (unsigned Idx = 0, E = MaxSetPressure.size(); Idx != E; ++Idx) {
+    // Consider only the RVV register sets, as RVV spilling/reloading has
+    // higher potential costs than hazards. Query the set name once per
+    // iteration instead of twice.
+    StringRef SetName = getRegPressureSetName(Idx);
+    if (!SetName.starts_with("VM") && !SetName.starts_with("VRM8NoV0"))
+      continue;
+    if (MaxSetPressure[Idx] + RVVRegPressureThreshold >
+        getRegPressureSetLimit(MF, Idx))
+      return true;
+  }
+  return false;
+}
+
+bool RISCVRegisterInfo::needReleaseSUFromPendingQueue(
+    MachineFunction &MF, ArrayRef<unsigned> PSetID,
+    ArrayRef<int> UnitInc) const {
+  // A pressure decrease at least this large means the SUnit relieves enough
+  // RVV pressure to be worth releasing from the pending queue early.
+  constexpr int UnitIncRVVRegPressureThreshold = -3;
+  for (unsigned Idx = 0, E = PSetID.size(); Idx != E; ++Idx) {
+    // Consider only the RVV register sets, as RVV spilling/reloading has
+    // higher potential costs than hazards.
+    // Bug fix: here Idx indexes the PSetID/UnitInc arrays, not a pressure
+    // set; the second starts_with() previously queried
+    // getRegPressureSetName(Idx) instead of PSetID[Idx].
+    StringRef SetName = getRegPressureSetName(PSetID[Idx]);
+    if (!SetName.starts_with("VM") && !SetName.starts_with("VRM8NoV0"))
+      continue;
+    if (UnitInc[Idx] < UnitIncRVVRegPressureThreshold)
+      return true;
+  }
+  return false;
+}
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
index 3ab79694e175c8a..faf81b2d8b73d65 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
@@ -144,6 +144,13 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo {
static bool isRVVRegClass(const TargetRegisterClass *RC) {
return RISCVRI::isVRegClass(RC->TSFlags);
}
+ bool
+ needReleasePendingQueue(MachineFunction &MF,
+ ArrayRef<unsigned> MaxSetPressure) const override;
+
+ bool needReleaseSUFromPendingQueue(MachineFunction &MF,
+ ArrayRef<unsigned> PSetID,
+ ArrayRef<int> UnitInc) const override;
};
} // namespace llvm
diff --git a/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll
index 1ed84316d4484cd..cd795f722676bd9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll
@@ -1137,38 +1137,38 @@ define <vscale x 8 x i64> @bitreverse_nxv8i64(<vscale x 8 x i64> %va) {
; RV32-NEXT: li a1, 56
; RV32-NEXT: li a2, 40
; RV32-NEXT: lui a3, 16
-; RV32-NEXT: lui a4, 4080
-; RV32-NEXT: addi a5, sp, 8
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: sw zero, 12(sp)
-; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; RV32-NEXT: vsetvli a4, zero, e64, m8, ta, ma
; RV32-NEXT: vsrl.vx v16, v8, a1
; RV32-NEXT: vsrl.vx v24, v8, a2
-; RV32-NEXT: addi a0, a3, -256
+; RV32-NEXT: addi a3, a3, -256
; RV32-NEXT: vsll.vx v0, v8, a1
-; RV32-NEXT: vand.vx v24, v24, a0
+; RV32-NEXT: vand.vx v24, v24, a3
; RV32-NEXT: vor.vv v16, v24, v16
; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: vand.vx v16, v8, a0
+; RV32-NEXT: vand.vx v16, v8, a3
; RV32-NEXT: vsll.vx v16, v16, a2
; RV32-NEXT: vor.vv v16, v0, v16
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 3
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT: vlse64.v v0, (a5), zero
-; RV32-NEXT: vsrl.vi v16, v8, 24
-; RV32-NEXT: vand.vx v16, v16, a4
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v0, v8, 24
+; RV32-NEXT: lui a1, 4080
+; RV32-NEXT: addi a2, sp, 8
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: sw zero, 12(sp)
+; RV32-NEXT: vand.vx v0, v0, a1
; RV32-NEXT: vsrl.vi v24, v8, 8
-; RV32-NEXT: vand.vv v24, v24, v0
-; RV32-NEXT: vor.vv v16, v24, v16
+; RV32-NEXT: vlse64.v v16, (a2), zero
+; RV32-NEXT: vand.vv v24, v24, v16
+; RV32-NEXT: vor.vv v24, v24, v0
; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v24, v16, v24
-; RV32-NEXT: vand.vv v16, v8, v0
-; RV32-NEXT: vand.vx v8, v8, a4
+; RV32-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v24, v24, v0
+; RV32-NEXT: vand.vv v16, v8, v16
+; RV32-NEXT: vand.vx v8, v8, a1
; RV32-NEXT: vsll.vi v8, v8, 24
; RV32-NEXT: vsll.vi v16, v16, 8
; RV32-NEXT: vor.vv v8, v8, v16
diff --git a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
index 4d34621cd5f243c..8ae560c07e21016 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
@@ -2288,66 +2288,68 @@ define <vscale x 7 x i64> @vp_bitreverse_nxv7i64(<vscale x 7 x i64> %va, <vscale
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: sub sp, sp, a1
; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vmv8r.v v24, v8
; RV32-NEXT: lui a1, 1044480
; RV32-NEXT: li a2, 56
; RV32-NEXT: lui a3, 16
; RV32-NEXT: li a4, 40
-; RV32-NEXT: addi a5, sp, 8
+; RV32-NEXT: lui a5, 4080
+; RV32-NEXT: addi a6, sp, 8
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: sw zero, 12(sp)
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsll.vx v16, v8, a2, v0.t
; RV32-NEXT: addi a1, a3, -256
-; RV32-NEXT: vand.vx v24, v8, a1, v0.t
-; RV32-NEXT: vsll.vx v24, v24, a4, v0.t
-; RV32-NEXT: vor.vv v16, v16, v24, v0.t
+; RV32-NEXT: vand.vx v8, v8, a1, v0.t
+; RV32-NEXT: vsll.vx v8, v8, a4, v0.t
+; RV32-NEXT: vor.vv v8, v16, v8, v0.t
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vlse64.v v16, (a5), zero
+; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vand.vx v16, v24, a5, v0.t
+; RV32-NEXT: vsll.vi v8, v16, 24, v0.t
+; RV32-NEXT: addi a3, sp, 16
+; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vlse64.v v8, (a6), zero
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vand.vv v16, v24, v8, v0.t
+; RV32-NEXT: vsll.vi v16, v16, 8, v0.t
+; RV32-NEXT: addi a3, sp, 16
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v16, v8, v16, v0.t
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v16, v8, v16, v0.t
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: lui a3, 4080
-; RV32-NEXT: vand.vx v24, v8, a3, v0.t
-; RV32-NEXT: vsll.vi v24, v24, 24, v0.t
-; RV32-NEXT: addi a5, sp, 16
-; RV32-NEXT: vs8r.v v24, (a5) # Unknown-size Folded Spill
-; RV32-NEXT: vand.vv v24, v8, v16, v0.t
-; RV32-NEXT: vsll.vi v16, v24, 8, v0.t
-; RV32-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v16, v24, v16, v0.t
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: slli a5, a5, 4
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v16, v24, v16, v0.t
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: slli a5, a5, 4
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t
-; RV32-NEXT: vsrl.vx v24, v8, a4, v0.t
-; RV32-NEXT: vand.vx v24, v24, a1, v0.t
-; RV32-NEXT: vor.vv v16, v24, v16, v0.t
+; RV32-NEXT: vsrl.vx v16, v24, a2, v0.t
+; RV32-NEXT: vsrl.vx v8, v24, a4, v0.t
+; RV32-NEXT: vand.vx v8, v8, a1, v0.t
+; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT: vand.vx v24, v24, a3, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v8, v24, 24, v0.t
+; RV32-NEXT: vand.vx v16, v8, a5, v0.t
+; RV32-NEXT: vsrl.vi v8, v24, 8, v0.t
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: slli a1, a1, 3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: vor.vv v8, v8, v24, v0.t
+; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v8, v8, v24, v0.t
+; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
@@ -2497,40 +2499,40 @@ define <vscale x 7 x i64> @vp_bitreverse_nxv7i64_unmasked(<vscale x 7 x i64> %va
; RV32-NEXT: lui a3, 16
; RV32-NEXT: li a4, 40
; RV32-NEXT: lui a5, 4080
-; RV32-NEXT: addi a6, sp, 8
-; RV32-NEXT: sw a1, 8(sp)
-; RV32-NEXT: sw zero, 12(sp)
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsll.vx v16, v8, a2
-; RV32-NEXT: addi a1, a3, -256
; RV32-NEXT: vsrl.vx v24, v8, a2
+; RV32-NEXT: addi a2, sp, 8
+; RV32-NEXT: addi a3, a3, -256
; RV32-NEXT: vsrl.vx v0, v8, a4
-; RV32-NEXT: vand.vx v0, v0, a1
+; RV32-NEXT: vand.vx v0, v0, a3
; RV32-NEXT: vor.vv v24, v0, v24
-; RV32-NEXT: addi a2, sp, 16
-; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: vand.vx v24, v8, a1
+; RV32-NEXT: addi a6, sp, 16
+; RV32-NEXT: vs8r.v v24, (a6) # Unknown-size Folded Spill
+; RV32-NEXT: vand.vx v24, v8, a3
; RV32-NEXT: vsll.vx v24, v24, a4
; RV32-NEXT: vor.vv v16, v16, v24
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: vlse64.v v24, (a6), zero
-; RV32-NEXT: vsrl.vi v16, v8, 24
-; RV32-NEXT: vand.vx v16, v16, a5
-; RV32-NEXT: vsrl.vi v0, v8, 8
-; RV32-NEXT: vand.vv v0, v0, v24
-; RV32-NEXT: vor.vv v16, v0, v16
-; RV32-NEXT: vand.vv v24, v8, v24
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 3
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v24, v8, 24
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw zero, 12(sp)
+; RV32-NEXT: vand.vx v0, v24, a5
+; RV32-NEXT: vsrl.vi v24, v8, 8
+; RV32-NEXT: vlse64.v v16, (a2), zero
+; RV32-NEXT: vand.vv v24, v24, v16
+; RV32-NEXT: vor.vv v0, v24, v0
+; RV32-NEXT: vand.vv v16, v8, v16
; RV32-NEXT: vand.vx v8, v8, a5
; RV32-NEXT: vsll.vi v8, v8, 24
-; RV32-NEXT: vsll.vi v24, v24, 8
-; RV32-NEXT: vor.vv v24, v8, v24
+; RV32-NEXT: vsll.vi v16, v16, 8
+; RV32-NEXT: vor.vv v24, v8, v16
; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v8, v16, v8
+; RV32-NEXT: vor.vv v8, v0, v8
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: lui a2, 209715
; RV32-NEXT: lui a3, 349525
@@ -2673,66 +2675,68 @@ define <vscale x 8 x i64> @vp_bitreverse_nxv8i64(<vscale x 8 x i64> %va, <vscale
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: sub sp, sp, a1
; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vmv8r.v v24, v8
; RV32-NEXT: lui a1, 1044480
; RV32-NEXT: li a2, 56
; RV32-NEXT: lui a3, 16
; RV32-NEXT: li a4, 40
-; RV32-NEXT: addi a5, sp, 8
+; RV32-NEXT: lui a5, 4080
+; RV32-NEXT: addi a6, sp, 8
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: sw zero, 12(sp)
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsll.vx v16, v8, a2, v0.t
; RV32-NEXT: addi a1, a3, -256
-; RV32-NEXT: vand.vx v24, v8, a1, v0.t
-; RV32-NEXT: vsll.vx v24, v24, a4, v0.t
-; RV32-NEXT: vor.vv v16, v16, v24, v0.t
+; RV32-NEXT: vand.vx v8, v8, a1, v0.t
+; RV32-NEXT: vsll.vx v8, v8, a4, v0.t
+; RV32-NEXT: vor.vv v8, v16, v8, v0.t
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vlse64.v v16, (a5), zero
+; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vand.vx v16, v24, a5, v0.t
+; RV32-NEXT: vsll.vi v8, v16, 24, v0.t
+; RV32-NEXT: addi a3, sp, 16
+; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vlse64.v v8, (a6), zero
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vand.vv v16, v24, v8, v0.t
+; RV32-NEXT: vsll.vi v16, v16, 8, v0.t
+; RV32-NEXT: addi a3, sp, 16
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v16, v8, v16, v0.t
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v16, v8, v16, v0.t
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: lui a3, 4080
-; RV32-NEXT: vand.vx v24, v8, a3, v0.t
-; RV32-NEXT: vsll.vi v24, v24, 24, v0.t
-; RV32-NEXT: addi a5, sp, 16
-; RV32-NEXT: vs8r.v v24, (a5) # Unknown-size Folded Spill
-; RV32-NEXT: vand.vv v24, v8, v16, v0.t
-; RV32-NEXT: vsll.vi v16, v24, 8, v0.t
-; RV32-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v16, v24, v16,...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/125468
More information about the llvm-commits
mailing list