[llvm] [RISCV] Enable load clustering by default (PR #73789)

Wed Nov 29 04:28:26 PST 2023

llvmbot wrote:



@llvm/pr-subscribers-backend-risc-v

@llvm/pr-subscribers-backend-aarch64

Author: Alex Bradbury (asb)

<details>
<summary>Changes</summary>

[RISCV] Enable load clustering by default

Also tweaks the heuristic to cluster if operations are within a cache
line of each other (as AMDGPU does in shouldScheduleLoadsNear). X86 does
something similar, but checks `((Offset2 - Offset1) / 8 > 64)`. I'm not
sure if that threshold is intentionally 512 bytes or if the division is
an error.

Posting for comment and so that people can test on their workloads.
Feedback and ideas for a tweaked heuristic are welcome.

Stacks on top of #73778.


---

Patch is 662.06 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/73789.diff


45 Files Affected:

- (modified) llvm/include/llvm/CodeGen/TargetInstrInfo.h (+6) 
- (modified) llvm/lib/CodeGen/MachineScheduler.cpp (+9-5) 
- (modified) llvm/lib/Target/AArch64/AArch64InstrInfo.cpp (+3-2) 
- (modified) llvm/lib/Target/AArch64/AArch64InstrInfo.h (+2) 
- (modified) llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp (+4-1) 
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+2) 
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.h (+2) 
- (modified) llvm/lib/Target/PowerPC/PPCInstrInfo.cpp (+3-2) 
- (modified) llvm/lib/Target/PowerPC/PPCInstrInfo.h (+2) 
- (modified) llvm/lib/Target/RISCV/RISCVInstrInfo.cpp (+6-5) 
- (modified) llvm/lib/Target/RISCV/RISCVInstrInfo.h (+2) 
- (modified) llvm/lib/Target/RISCV/RISCVTargetMachine.cpp (+3-13) 
- (modified) llvm/test/CodeGen/RISCV/add-before-shl.ll (+13-13) 
- (modified) llvm/test/CodeGen/RISCV/callee-saved-fpr32s.ll (+360-360) 
- (modified) llvm/test/CodeGen/RISCV/callee-saved-fpr64s.ll (+248-248) 
- (modified) llvm/test/CodeGen/RISCV/callee-saved-gprs.ll (+522-522) 
- (modified) llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll (+44-44) 
- (modified) llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll (+22-22) 
- (modified) llvm/test/CodeGen/RISCV/iabs.ll (+52-52) 
- (modified) llvm/test/CodeGen/RISCV/idiv_large.ll (+2-4) 
- (modified) llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll (+8-8) 
- (modified) llvm/test/CodeGen/RISCV/legalize-fneg.ll (+4-4) 
- (modified) llvm/test/CodeGen/RISCV/llvm.exp10.ll (+13-13) 
- (modified) llvm/test/CodeGen/RISCV/mul.ll (+30-30) 
- (modified) llvm/test/CodeGen/RISCV/nontemporal.ll (+710-710) 
- (modified) llvm/test/CodeGen/RISCV/push-pop-popret.ll (+1002-1002) 
- (modified) llvm/test/CodeGen/RISCV/reduction-formation.ll (+36-36) 
- (modified) llvm/test/CodeGen/RISCV/rv32zbb.ll (+22-22) 
- (modified) llvm/test/CodeGen/RISCV/rv64i-shift-sext.ll (+2-2) 
- (modified) llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll (+22-22) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll (+17-17) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll (+11-11) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll (+14-14) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll (+8-8) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll (+132-132) 
- (modified) llvm/test/CodeGen/RISCV/rvv/pr63596.ll (+6-6) 
- (modified) llvm/test/CodeGen/RISCV/shifts.ll (+259-241) 
- (modified) llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll (+91-91) 
- (modified) llvm/test/CodeGen/RISCV/srem-vector-lkk.ll (+107-107) 
- (modified) llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll (+49-49) 
- (modified) llvm/test/CodeGen/RISCV/unaligned-load-store.ll (+28-29) 
- (modified) llvm/test/CodeGen/RISCV/urem-vector-lkk.ll (+83-83) 
- (modified) llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll (+1073-1065) 
- (modified) llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll (+1911-1883) 
- (modified) llvm/test/CodeGen/RISCV/xtheadmempair.ll (+7-7) 


``````````diff

diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 665b7449ddb820a..4ec46f9bde1adf2 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1496,12 +1496,18 @@ class TargetInstrInfo : public MCInstrInfo {
   /// to TargetPassConfig::createMachineScheduler() to have an effect.
   ///
   /// \p BaseOps1 and \p BaseOps2 are memory operands of two memory operations.
+  /// \p Offset1 and \p Offset2 are the byte offsets for the memory
+  /// operations.
+  /// \p OffsetIsScalable1 and \p OffsetIsScalable2 indicate if the offset is
+  /// scaled by a runtime quantity.
   /// \p ClusterSize is the number of operations in the resulting load/store
   /// cluster if this hook returns true.
   /// \p NumBytes is the number of bytes that will be loaded from all the
   /// clustered loads if this hook returns true.
   virtual bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
+                                   int64_t Offset1, bool OffsetIsScalable1,
                                    ArrayRef<const MachineOperand *> BaseOps2,
+                                   int64_t Offset2, bool OffsetIsScalable2,
                                    unsigned ClusterSize,
                                    unsigned NumBytes) const {
     llvm_unreachable("target did not implement shouldClusterMemOps()");
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 4add33ba0996af0..cd5fe71ef0c1ad1 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -1698,11 +1698,12 @@ class BaseMemOpClusterMutation : public ScheduleDAGMutation {
     SmallVector<const MachineOperand *, 4> BaseOps;
     int64_t Offset;
     unsigned Width;
+    bool OffsetIsScalable;
 
     MemOpInfo(SUnit *SU, ArrayRef<const MachineOperand *> BaseOps,
-              int64_t Offset, unsigned Width)
+              int64_t Offset, bool OffsetIsScalable, unsigned Width)
         : SU(SU), BaseOps(BaseOps.begin(), BaseOps.end()), Offset(Offset),
-          Width(Width) {}
+          OffsetIsScalable(OffsetIsScalable), Width(Width) {}
 
     static bool Compare(const MachineOperand *const &A,
                         const MachineOperand *const &B) {
@@ -1831,8 +1832,10 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps(
           SUnit2ClusterInfo[MemOpa.SU->NodeNum].second + MemOpb.Width;
     }
 
-    if (!TII->shouldClusterMemOps(MemOpa.BaseOps, MemOpb.BaseOps, ClusterLength,
-                                  CurrentClusterBytes))
+    if (!TII->shouldClusterMemOps(MemOpa.BaseOps, MemOpa.Offset,
+                                  MemOpa.OffsetIsScalable, MemOpb.BaseOps,
+                                  MemOpb.Offset, MemOpb.OffsetIsScalable,
+                                  ClusterLength, CurrentClusterBytes))
       continue;
 
     SUnit *SUa = MemOpa.SU;
@@ -1899,7 +1902,8 @@ void BaseMemOpClusterMutation::collectMemOpRecords(
     unsigned Width;
     if (TII->getMemOperandsWithOffsetWidth(MI, BaseOps, Offset,
                                            OffsetIsScalable, Width, TRI)) {
-      MemOpRecords.push_back(MemOpInfo(&SU, BaseOps, Offset, Width));
+      MemOpRecords.push_back(
+          MemOpInfo(&SU, BaseOps, Offset, OffsetIsScalable, Width));
 
       LLVM_DEBUG(dbgs() << "Num BaseOps: " << BaseOps.size() << ", Offset: "
                         << Offset << ", OffsetIsScalable: " << OffsetIsScalable
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index e97f17e3f49c587..6b49e17528ada74 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -4230,8 +4230,9 @@ static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
 ///
 /// Only called for LdSt for which getMemOperandWithOffset returns true.
 bool AArch64InstrInfo::shouldClusterMemOps(
-    ArrayRef<const MachineOperand *> BaseOps1,
-    ArrayRef<const MachineOperand *> BaseOps2, unsigned ClusterSize,
+    ArrayRef<const MachineOperand *> BaseOps1, int64_t OpOffset1,
+    bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2,
+    int64_t OpOffset2, bool OffsetIsScalable2, unsigned ClusterSize,
     unsigned NumBytes) const {
   assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
   const MachineOperand &BaseOp1 = *BaseOps1.front();
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index cc588cdad6b8e5a..65e5fb49536da24 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -179,7 +179,9 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo {
                            int64_t &MinOffset, int64_t &MaxOffset);
 
   bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
+                           int64_t Offset1, bool OffsetIsScalable1,
                            ArrayRef<const MachineOperand *> BaseOps2,
+                           int64_t Offset2, bool OffsetIsScalable2,
                            unsigned ClusterSize,
                            unsigned NumBytes) const override;
 
diff --git a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
index 50f8ad4433c6d5c..442ae4dd7b34fe1 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
@@ -232,7 +232,10 @@ class SIInsertHardClauses : public MachineFunctionPass {
               // scheduler it limits the size of the cluster to avoid increasing
               // register pressure too much, but this pass runs after register
               // allocation so there is no need for that kind of limit.
-              !SII->shouldClusterMemOps(CI.BaseOps, BaseOps, 2, 2)))) {
+              // We also lie about the Offset and OffsetIsScalable parameters,
+              // as they aren't used in the SIInstrInfo implementation.
+              !SII->shouldClusterMemOps(CI.BaseOps, 0, false, BaseOps, 0, false,
+                                        2, 2)))) {
           // Finish the current clause.
           Changed |= emitClause(CI, SII);
           CI = ClauseInfo();
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index b5b456d6912544f..0a06fa88b6b1025 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -541,7 +541,9 @@ static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
 }
 
 bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
+                                      int64_t Offset1, bool OffsetIsScalable1,
                                       ArrayRef<const MachineOperand *> BaseOps2,
+                                      int64_t Offset2, bool OffsetIsScalable2,
                                       unsigned ClusterSize,
                                       unsigned NumBytes) const {
   // If the mem ops (to be clustered) do not have the same base ptr, then they
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index eba817756e9c58e..6da4f74dfe5f3ea 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -234,7 +234,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
       const TargetRegisterInfo *TRI) const final;
 
   bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
+                           int64_t Offset1, bool OffsetIsScalable1,
                            ArrayRef<const MachineOperand *> BaseOps2,
+                           int64_t Offset2, bool OffsetIsScalable2,
                            unsigned ClusterSize,
                            unsigned NumBytes) const override;
 
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
index 6784049348b1638..0de795d9d5e812e 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -2878,8 +2878,9 @@ static bool isClusterableLdStOpcPair(unsigned FirstOpc, unsigned SecondOpc,
 }
 
 bool PPCInstrInfo::shouldClusterMemOps(
-    ArrayRef<const MachineOperand *> BaseOps1,
-    ArrayRef<const MachineOperand *> BaseOps2, unsigned ClusterSize,
+    ArrayRef<const MachineOperand *> BaseOps1, int64_t OpOffset1,
+    bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2,
+    int64_t OpOffset2, bool OffsetIsScalable2, unsigned ClusterSize,
     unsigned NumBytes) const {
 
   assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h
index 31e9859a41739a1..0c9ad607418ecc7 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h
@@ -530,7 +530,9 @@ class PPCInstrInfo : public PPCGenInstrInfo {
   /// Returns true if the two given memory operations should be scheduled
   /// adjacent.
   bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
+                           int64_t Offset1, bool OffsetIsScalable1,
                            ArrayRef<const MachineOperand *> BaseOps2,
+                           int64_t Offset2, bool OffsetIsScalable2,
                            unsigned ClusterSize,
                            unsigned NumBytes) const override;
 
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 2918e5654db4f9f..cf399f37d255c29 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -2266,8 +2266,9 @@ static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
 }
 
 bool RISCVInstrInfo::shouldClusterMemOps(
-    ArrayRef<const MachineOperand *> BaseOps1,
-    ArrayRef<const MachineOperand *> BaseOps2, unsigned ClusterSize,
+    ArrayRef<const MachineOperand *> BaseOps1, int64_t Offset1,
+    bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2,
+    int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize,
     unsigned NumBytes) const {
   // If the mem ops (to be clustered) do not have the same base ptr, then they
   // should not be clustered
@@ -2281,9 +2282,9 @@ bool RISCVInstrInfo::shouldClusterMemOps(
     return false;
   }
 
-  // TODO: Use a more carefully chosen heuristic, e.g. only cluster if offsets
-  // indicate they likely share a cache line.
-  return ClusterSize <= 4;
+  // A cache line is typically 64 bytes, so cluster if the memory ops are on
+  // the same or a neighbouring cache line.
+  return std::abs(Offset1 - Offset2) < 64;
 }
 
 // Set BaseReg (the base register operand), Offset (the byte offset being
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
index 0954286a419bdd5..7e1d3f31180650d 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -158,7 +158,9 @@ class RISCVInstrInfo : public RISCVGenInstrInfo {
       const TargetRegisterInfo *TRI) const override;
 
   bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
+                           int64_t Offset1, bool OffsetIsScalable1,
                            ArrayRef<const MachineOperand *> BaseOps2,
+                           int64_t Offset2, bool OffsetIsScalable2,
                            unsigned ClusterSize,
                            unsigned NumBytes) const override;
 
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index b6c194f03f54209..0954fbb8314c6fa 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -95,11 +95,6 @@ static cl::opt<bool>
                         cl::desc("Enable Split RegisterAlloc for RVV"),
                         cl::init(false));
 
-static cl::opt<bool> EnableMISchedLoadClustering(
-    "riscv-misched-load-clustering", cl::Hidden,
-    cl::desc("Enable load clustering in the machine scheduler"),
-    cl::init(false));
-
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() {
   RegisterTargetMachine<RISCVTargetMachine> X(getTheRISCV32Target());
   RegisterTargetMachine<RISCVTargetMachine> Y(getTheRISCV64Target());
@@ -350,15 +345,10 @@ class RISCVPassConfig : public TargetPassConfig {
   ScheduleDAGInstrs *
   createMachineScheduler(MachineSchedContext *C) const override {
     const RISCVSubtarget &ST = C->MF->getSubtarget<RISCVSubtarget>();
-    ScheduleDAGMILive *DAG = nullptr;
-    if (EnableMISchedLoadClustering) {
-      DAG = createGenericSchedLive(C);
-      DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
-    }
-    if (ST.hasMacroFusion()) {
-      DAG = DAG ? DAG : createGenericSchedLive(C);
+    ScheduleDAGMILive *DAG = createGenericSchedLive(C);
+    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+    if (ST.hasMacroFusion())
       DAG->addMutation(createRISCVMacroFusionDAGMutation());
-    }
     return DAG;
   }
 
diff --git a/llvm/test/CodeGen/RISCV/add-before-shl.ll b/llvm/test/CodeGen/RISCV/add-before-shl.ll
index 274f1cef49aa955..3695a8a7f60862f 100644
--- a/llvm/test/CodeGen/RISCV/add-before-shl.ll
+++ b/llvm/test/CodeGen/RISCV/add-before-shl.ll
@@ -200,25 +200,25 @@ define i128 @add_wide_operand(i128 %a) nounwind {
 ;
 ; RV32C-LABEL: add_wide_operand:
 ; RV32C:       # %bb.0:
-; RV32C-NEXT:    lw a6, 4(a1)
+; RV32C-NEXT:    lw a6, 8(a1)
 ; RV32C-NEXT:    c.lw a3, 12(a1)
-; RV32C-NEXT:    c.lw a4, 0(a1)
-; RV32C-NEXT:    c.lw a1, 8(a1)
+; RV32C-NEXT:    c.lw a2, 4(a1)
+; RV32C-NEXT:    c.lw a1, 0(a1)
 ; RV32C-NEXT:    c.lui a5, 16
 ; RV32C-NEXT:    c.add a3, a5
 ; RV32C-NEXT:    c.slli a3, 3
-; RV32C-NEXT:    srli a5, a1, 29
-; RV32C-NEXT:    c.or a3, a5
-; RV32C-NEXT:    srli a5, a4, 29
-; RV32C-NEXT:    slli a2, a6, 3
-; RV32C-NEXT:    c.or a2, a5
 ; RV32C-NEXT:    srli a5, a6, 29
+; RV32C-NEXT:    c.or a3, a5
+; RV32C-NEXT:    srli a5, a1, 29
+; RV32C-NEXT:    slli a4, a2, 3
+; RV32C-NEXT:    c.or a4, a5
+; RV32C-NEXT:    c.srli a2, 29
+; RV32C-NEXT:    c.slli a6, 3
+; RV32C-NEXT:    or a2, a6, a2
 ; RV32C-NEXT:    c.slli a1, 3
-; RV32C-NEXT:    c.or a1, a5
-; RV32C-NEXT:    c.slli a4, 3
-; RV32C-NEXT:    c.sw a4, 0(a0)
-; RV32C-NEXT:    c.sw a1, 8(a0)
-; RV32C-NEXT:    c.sw a2, 4(a0)
+; RV32C-NEXT:    c.sw a1, 0(a0)
+; RV32C-NEXT:    c.sw a2, 8(a0)
+; RV32C-NEXT:    c.sw a4, 4(a0)
 ; RV32C-NEXT:    c.sw a3, 12(a0)
 ; RV32C-NEXT:    c.jr ra
 ;
diff --git a/llvm/test/CodeGen/RISCV/callee-saved-fpr32s.ll b/llvm/test/CodeGen/RISCV/callee-saved-fpr32s.ll
index 7111316931f19b7..7aeaaab68a208cb 100644
--- a/llvm/test/CodeGen/RISCV/callee-saved-fpr32s.ll
+++ b/llvm/test/CodeGen/RISCV/callee-saved-fpr32s.ll
@@ -25,69 +25,69 @@ define void @callee() nounwind {
 ; ILP32:       # %bb.0:
 ; ILP32-NEXT:    lui a0, %hi(var)
 ; ILP32-NEXT:    flw fa5, %lo(var)(a0)
-; ILP32-NEXT:    flw fa4, %lo(var+4)(a0)
-; ILP32-NEXT:    flw fa3, %lo(var+8)(a0)
-; ILP32-NEXT:    flw fa2, %lo(var+12)(a0)
 ; ILP32-NEXT:    addi a1, a0, %lo(var)
-; ILP32-NEXT:    flw fa1, 16(a1)
-; ILP32-NEXT:    flw fa0, 20(a1)
-; ILP32-NEXT:    flw ft0, 24(a1)
-; ILP32-NEXT:    flw ft1, 28(a1)
-; ILP32-NEXT:    flw ft2, 32(a1)
-; ILP32-NEXT:    flw ft3, 36(a1)
-; ILP32-NEXT:    flw ft4, 40(a1)
-; ILP32-NEXT:    flw ft5, 44(a1)
-; ILP32-NEXT:    flw ft6, 48(a1)
-; ILP32-NEXT:    flw ft7, 52(a1)
-; ILP32-NEXT:    flw fa6, 56(a1)
-; ILP32-NEXT:    flw fa7, 60(a1)
-; ILP32-NEXT:    flw ft8, 64(a1)
-; ILP32-NEXT:    flw ft9, 68(a1)
-; ILP32-NEXT:    flw ft10, 72(a1)
-; ILP32-NEXT:    flw ft11, 76(a1)
-; ILP32-NEXT:    flw fs0, 80(a1)
-; ILP32-NEXT:    flw fs1, 84(a1)
-; ILP32-NEXT:    flw fs2, 88(a1)
-; ILP32-NEXT:    flw fs3, 92(a1)
-; ILP32-NEXT:    flw fs4, 96(a1)
-; ILP32-NEXT:    flw fs5, 100(a1)
-; ILP32-NEXT:    flw fs6, 104(a1)
-; ILP32-NEXT:    flw fs7, 108(a1)
+; ILP32-NEXT:    flw fa4, 16(a1)
+; ILP32-NEXT:    flw fa3, 20(a1)
+; ILP32-NEXT:    flw fa2, 24(a1)
+; ILP32-NEXT:    flw fa1, 28(a1)
+; ILP32-NEXT:    flw fa0, 32(a1)
+; ILP32-NEXT:    flw ft0, 36(a1)
+; ILP32-NEXT:    flw ft1, 40(a1)
+; ILP32-NEXT:    flw ft2, 44(a1)
+; ILP32-NEXT:    flw ft3, 48(a1)
+; ILP32-NEXT:    flw ft4, 52(a1)
+; ILP32-NEXT:    flw ft5, 56(a1)
+; ILP32-NEXT:    flw ft6, 60(a1)
+; ILP32-NEXT:    flw ft7, 64(a1)
+; ILP32-NEXT:    flw fa6, 68(a1)
+; ILP32-NEXT:    flw fa7, 72(a1)
+; ILP32-NEXT:    flw ft8, 76(a1)
+; ILP32-NEXT:    flw ft9, 80(a1)
+; ILP32-NEXT:    flw ft10, 84(a1)
+; ILP32-NEXT:    flw ft11, 88(a1)
+; ILP32-NEXT:    flw fs0, 92(a1)
+; ILP32-NEXT:    flw fs1, 96(a1)
+; ILP32-NEXT:    flw fs2, 100(a1)
+; ILP32-NEXT:    flw fs3, 104(a1)
+; ILP32-NEXT:    flw fs4, 108(a1)
+; ILP32-NEXT:    flw fs5, 112(a1)
+; ILP32-NEXT:    flw fs6, 116(a1)
+; ILP32-NEXT:    flw fs7, 120(a1)
 ; ILP32-NEXT:    flw fs8, 124(a1)
-; ILP32-NEXT:    flw fs9, 120(a1)
-; ILP32-NEXT:    flw fs10, 116(a1)
-; ILP32-NEXT:    flw fs11, 112(a1)
+; ILP32-NEXT:    flw fs9, %lo(var+4)(a0)
+; ILP32-NEXT:    flw fs10, %lo(var+8)(a0)
+; ILP32-NEXT:    flw fs11, %lo(var+12)(a0)
 ; ILP32-NEXT:    fsw fs8, 124(a1)
-; ILP32-NEXT:    fsw fs9, 120(a1)
-; ILP32-NEXT:    fsw fs10, 116(a1)
-; ILP32-NEXT:    fsw fs11, 112(a1)
-; ILP32-NEXT:    fsw fs7, 108(a1)
-; ILP32-NEXT:    fsw fs6, 104(a1)
-; ILP32-NEXT:    fsw fs5, 100(a1)
-; ILP32-NEXT:    fsw fs4, 96(a1)
-; ILP32-NEXT:    fsw fs3, 92(a1)
-; ILP32-NEXT:    fsw fs2, 88(a1)
-; ILP32-NEXT:    fsw fs1, 84(a1)
-; ILP32-NEXT:    fsw fs0, 80(a1)
-; ILP32-NEXT:    fsw ft11, 76(a1)
-; ILP32-NEXT:    fsw ft10, 72(a1)
-; ILP32-NEXT:    fsw ft9, 68(a1)
-; ILP32-NEXT:    fsw ft8, 64(a1)
-; ILP32-NEXT:    fsw fa7, 60(a1)
-; ILP32-NEXT:    fsw fa6, 56(a1)
-; ILP32-NEXT:    fsw ft7, 52(a1)
-; ILP32-NEXT:    fsw ft6, 48(a1)
-; ILP32-NEXT:    fsw ft5, 44(a1)
-; ILP32-NEXT:    fsw ft4, 40(a1)
-; ILP32-NEXT:    fsw ft3, 36(a1)
-; ILP32-NEXT:    fsw ft2, 32(a1)
-; ILP32-NEXT:    fsw ft1, 28(a1)
-; ILP32-NEXT:    fsw ft0, 24(a1)
-; ILP32-NEXT:    fsw fa0, 20(a1)
-; ILP32-NEXT:    fsw fa1, 16(a1)
-; ILP32-NEXT:    fsw fa2, %lo(var+12)(a0)
-; ILP32-NEXT:    fsw fa3, %lo(var+8)(a0)
-; ILP32-NEXT:    fsw fa4, %lo(var+4)(a0)
+; ILP32-NEXT:    fsw fs7, 120(a1)
+; ILP32-NEXT:    fsw fs6, 116(a1)
+; ILP32-NEXT:    fsw fs5, 112(a1)
+; ILP32-NEXT:    fsw fs4, 108(a1)
+; ILP32-NEXT:    fsw fs3, 104(a1)
+; ILP32-NEXT:    fsw fs2, 100(a1)
+; ILP32-NEXT:    fsw fs1, 96(a1)
+; ILP32-NEXT:    fsw fs0, 92(a1)
+; ILP32-NEXT:    fsw ft11, 88(a1)
+; ILP32-NEXT:    fsw ft10, 84(a1)
+; ILP32-NEXT:    fsw ft9, 80(a1)
+; ILP32-NEXT:    fsw ft8, 76(a1)
+; ILP32-NEXT:    fsw fa7, 72(a1)
+; ILP32-NEXT:    fsw fa6, 68(a1)
+; ILP32-NEXT:    fsw ft7, 64(a1)
+; ILP32-NEXT:    fsw ft6, 60(a1)
+; ILP32-NEXT:    fsw ft5, 56(a1)
+; ILP32-NEXT:    fsw ft4, 52(a1)
+; ILP32-NEXT:    fsw ft3, 48(a1)
+; ILP32-NEXT:    fsw ft2, 44(a1)
+; ILP32-NEXT:    fsw ft1, 40(a1)
+; ILP32-NEXT:    fsw ft0, 36(a1)
+; ILP32-NEXT:    fsw fa0, 32(a1)
+; ILP32-NEXT:    fsw fa1, 28(a1)
+; ILP32-NEXT:    fsw fa2, 24(a1)
+; ILP32-NEXT:    fsw fa3, 20(a1)
+; ILP32-NEXT:    fsw fa4, 16(a1)
+; ILP32-NEXT:    fsw fs11, %lo(var+12)(a0)
+; ILP32-NEXT:    fsw fs10, %lo(var+8)(a0)
+; ILP32-NEXT:    fsw fs9, %lo(var+4)(a0)
 ; ILP32-NEXT:    fsw fa5, %lo(var)(a0)
 ; ILP32-NEXT:    ret
 ;
@@ -95,69 +95,69 @@ define void @callee() nounwind {
 ; LP64:       # %bb.0:
 ; LP64-NEXT:    lui a0, %hi(var)
 ; LP64-NEXT:    flw fa5, %lo(var)(a0)
-; LP64-NEXT:    flw fa4, %lo(var+4)(a0)
-; LP64-NEXT:    flw fa3, %lo(var+8)(a0)
-; LP64-NEXT:    flw fa2, %lo(var+12)(a0)
 ; LP64-NEXT:    addi a1, a0, %lo(var)
-; LP64-NEXT:    flw fa1, 16(a1)
-; LP64-NEXT:    flw fa0, 20(a1)
-; LP64-NEXT:    flw ft0, 24(a1)
-; LP64-NEXT:    flw ft1, 28(a1)
-; LP64-NEXT:    flw ft2, 32(a1)
-; LP64-NEXT:    flw ft3, 36(a1)
-; LP64-NEXT:    flw ft4, 40(a1)
-; LP64-NEXT:    flw ft5, 44(a1)
-; LP64-NEXT:    flw ft6, 48(a1)
-; LP64-NEXT:    flw ft7, 52(a1)
-; LP64-NEXT:    flw fa6, 56(a1)
-; LP64-NEXT:    flw fa7, 60(a1)
-; LP64-NEXT:    flw ft8, 64(a1)
-; LP64-NEXT:    flw ft9, 68(a1)
-; LP64-NEXT:    flw ft10, 72(a1)
-; LP64-NEXT:    flw ft11, 76(a1)
-; LP64-NEXT:    flw fs0, 80(a1)
-; LP64-NEXT:    flw fs1, 84(a1)
-; LP64-NEXT:    flw fs2, 88(a1)
-; LP64-NEXT:    flw fs3, 92(a1)
-; LP64-NEXT:    flw fs4, 96(a1)
-; LP64-NEXT:    flw fs5, 100(a1)
-; LP64-NEXT:  ...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/73789