[llvm] MTM: improve operand latency when missing sched info (PR #101389)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Aug 7 03:58:34 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-llvm-globalisel
Author: Ramkumar Ramachandra (artagnon)
Changes:
TargetSchedModel::computeOperandLatency is supposed to return the exact latency between two MIs, but InstrSchedModel and InstrItineraries are often unavailable in real-world scenarios. When neither source of information is available, the function returns an estimate that is much too conservative: the default def latency. MachineTraceMetrics is one of the callers affected quite badly by these conservative estimates. To improve the estimate, and let callers of MTM generate better code, offset the default def latency by the estimated number of cycles elapsed between the def MI and the use MI. Since we're trying to improve codegen precisely when no scheduling information is available, the number of cycles elapsed between the two MIs cannot be determined exactly, so we use the distance between them, accounting for issue width, as a crude approximation. In practice, offsetting one crude estimate by another leads to better codegen on average, and yields non-trivial gains on standard benchmarks.
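
To make the heuristic concrete, here is a minimal standalone sketch of the arithmetic with made-up numbers (the micro-op counts, issue width, and default def latency below are hypothetical; the real patch computes these from MachineInstrs via TargetSchedModel rather than plain vectors):

```cpp
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  // Hypothetical numbers: four instructions sit between the def and the use,
  // each decoding to one micro-op, on a 2-wide machine whose default def
  // latency is 3 cycles.
  std::vector<unsigned> MicroOpsBetweenDefAndUse = {1, 1, 1, 1};
  unsigned IssueWidth = 2;
  unsigned DefaultDefLatency = 3;

  // Crude estimate of cycles already elapsed between the def and the use:
  // total micro-ops divided by the issue width.
  unsigned Elapsed = std::accumulate(MicroOpsBetweenDefAndUse.begin(),
                                     MicroOpsBetweenDefAndUse.end(), 0u) /
                     IssueWidth;

  // Offset the default def latency by the elapsed cycles, clamping at zero:
  // by the time the use issues, part of the latency has already been paid.
  unsigned Latency =
      DefaultDefLatency > Elapsed ? DefaultDefLatency - Elapsed : 0;
  std::printf("elapsed=%u latency=%u\n", Elapsed, Latency);
  // Prints: elapsed=2 latency=1
}
```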
---
Patch is 281.95 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/101389.diff
77 Files Affected:
- (modified) llvm/lib/CodeGen/MachineTraceMetrics.cpp (+67-9)
- (modified) llvm/test/CodeGen/RISCV/GlobalISel/bitmanip.ll (+22-22)
- (modified) llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll (+6-6)
- (modified) llvm/test/CodeGen/RISCV/addcarry.ll (+2-2)
- (modified) llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll (+2-2)
- (modified) llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-common.ll (+10-10)
- (modified) llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll (+14-14)
- (modified) llvm/test/CodeGen/RISCV/calling-conv-ilp32e.ll (+28-28)
- (modified) llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll (+7-7)
- (modified) llvm/test/CodeGen/RISCV/compress.ll (+1-1)
- (modified) llvm/test/CodeGen/RISCV/copysign-casts.ll (+4-4)
- (modified) llvm/test/CodeGen/RISCV/div-pow2.ll (+5-5)
- (modified) llvm/test/CodeGen/RISCV/float-intrinsics.ll (+10-10)
- (modified) llvm/test/CodeGen/RISCV/iabs.ll (+20-20)
- (modified) llvm/test/CodeGen/RISCV/machine-combiner.mir (+5-4)
- (modified) llvm/test/CodeGen/RISCV/misched-load-clustering.ll (+3-3)
- (modified) llvm/test/CodeGen/RISCV/mul.ll (+25-25)
- (modified) llvm/test/CodeGen/RISCV/neg-abs.ll (+6-6)
- (modified) llvm/test/CodeGen/RISCV/reduction-formation.ll (+42-42)
- (modified) llvm/test/CodeGen/RISCV/rv32e.ll (+2-2)
- (modified) llvm/test/CodeGen/RISCV/rv32zba.ll (+2-2)
- (modified) llvm/test/CodeGen/RISCV/rv32zbb.ll (+1-1)
- (modified) llvm/test/CodeGen/RISCV/rv64e.ll (+2-2)
- (modified) llvm/test/CodeGen/RISCV/rv64zba.ll (+13-11)
- (modified) llvm/test/CodeGen/RISCV/rvv/compressstore.ll (+1-1)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll (+16-16)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll (+117-117)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll (+5-5)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll (+3-3)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll (+2-2)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll (+13-13)
- (modified) llvm/test/CodeGen/RISCV/rvv/vector-reassociations.ll (+7-7)
- (modified) llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll (+2-2)
- (modified) llvm/test/CodeGen/RISCV/srem-lkk.ll (+2-2)
- (modified) llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll (+21-21)
- (modified) llvm/test/CodeGen/RISCV/srem-vector-lkk.ll (+33-33)
- (modified) llvm/test/CodeGen/RISCV/urem-lkk.ll (+2-2)
- (modified) llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll (+14-14)
- (modified) llvm/test/CodeGen/RISCV/urem-vector-lkk.ll (+19-19)
- (modified) llvm/test/CodeGen/RISCV/xaluo.ll (+3-3)
- (modified) llvm/test/CodeGen/RISCV/xtheadmac.ll (+3-3)
- (modified) llvm/test/CodeGen/RISCV/xtheadmemidx.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll (+6-6)
- (modified) llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll (+4-4)
- (modified) llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll (+6-6)
- (modified) llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll (+194-197)
- (modified) llvm/test/CodeGen/X86/early-ifcvt-remarks.ll (+25-25)
- (modified) llvm/test/CodeGen/X86/fold-tied-op.ll (+33-33)
- (modified) llvm/test/CodeGen/X86/horizontal-sum.ll (+2-2)
- (modified) llvm/test/CodeGen/X86/is_fpclass.ll (+31-59)
- (modified) llvm/test/CodeGen/X86/lea-opt-cse4.ll (+10-8)
- (modified) llvm/test/CodeGen/X86/machine-cp.ll (+35-35)
- (modified) llvm/test/CodeGen/X86/madd.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/masked_gather_scatter.ll (+4-4)
- (modified) llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll (+2-2)
- (modified) llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll (+16-16)
- (modified) llvm/test/CodeGen/X86/midpoint-int-vec-256.ll (+82-82)
- (modified) llvm/test/CodeGen/X86/mul-constant-result.ll (+75-80)
- (modified) llvm/test/CodeGen/X86/mul-i512.ll (+19-19)
- (modified) llvm/test/CodeGen/X86/mul64.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/pr62653.ll (+43-43)
- (modified) llvm/test/CodeGen/X86/rotate-multi.ll (+12-12)
- (modified) llvm/test/CodeGen/X86/sad.ll (+27-26)
- (modified) llvm/test/CodeGen/X86/sext-vsetcc.ll (+35-34)
- (modified) llvm/test/CodeGen/X86/smul_fix.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/statepoint-live-in.ll (+27-27)
- (modified) llvm/test/CodeGen/X86/statepoint-regs.ll (+27-27)
- (modified) llvm/test/CodeGen/X86/ucmp.ll (+148-148)
- (modified) llvm/test/CodeGen/X86/umul-with-overflow.ll (+11-12)
- (modified) llvm/test/CodeGen/X86/umul_fix.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll (+25-28)
- (modified) llvm/test/CodeGen/X86/v8i1-masks.ll (+72-72)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll (+3-3)
- (modified) llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll (+4-4)
- (modified) llvm/test/CodeGen/X86/win-smallparams.ll (+16-16)
- (modified) llvm/test/CodeGen/X86/x86-interleaved-access.ll (+12-12)
- (modified) llvm/test/CodeGen/X86/xmulo.ll (+19-19)
``````````diff
diff --git a/llvm/lib/CodeGen/MachineTraceMetrics.cpp b/llvm/lib/CodeGen/MachineTraceMetrics.cpp
index bf3add010574b8..c3afba23628130 100644
--- a/llvm/lib/CodeGen/MachineTraceMetrics.cpp
+++ b/llvm/lib/CodeGen/MachineTraceMetrics.cpp
@@ -20,6 +20,7 @@
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
@@ -761,6 +762,64 @@ static void updatePhysDepsDownwards(const MachineInstr *UseMI,
}
}
+/// Estimates the number of cycles elapsed between DefMI and UseMI if they're
+/// non-null and in the same BasicBlock. Returns std::nullopt when UseMI is in a
+/// different MBB than DefMI, or when it is a dangling MI.
+static std::optional<unsigned>
+estimateDefUseCycles(const TargetSchedModel &Sched, const MachineInstr *DefMI,
+ const MachineInstr *UseMI) {
+ if (!DefMI || !UseMI || DefMI == UseMI)
+ return 0;
+ const MachineBasicBlock *ParentBB = DefMI->getParent();
+ if (ParentBB != UseMI->getParent())
+ return std::nullopt;
+
+ const auto DefIt =
+ llvm::find_if(ParentBB->instrs(),
+ [DefMI](const MachineInstr &MI) { return DefMI == &MI; });
+ const auto UseIt =
+ llvm::find_if(ParentBB->instrs(),
+ [UseMI](const MachineInstr &MI) { return UseMI == &MI; });
+ assert(std::distance(DefIt, UseIt) > 0 &&
+ "Def expected to appear before use");
+ unsigned NumMicroOps = 0;
+ for (auto It = DefIt; It != UseIt; ++It) {
+ // In some cases, UseMI is a dangling MI beyond the end of the MBB.
+ if (It.isEnd())
+ return std::nullopt;
+
+ NumMicroOps += Sched.getNumMicroOps(&*It);
+ }
+ return NumMicroOps / Sched.getIssueWidth();
+}
+
+/// Wraps Sched.computeOperandLatency, accounting for the case when
+/// InstrSchedModel and InstrItineraries are not available: in this case,
+/// Sched.computeOperandLatency returns DefaultDefLatency, which is a very
+/// rough approximation; to improve it, offset it by the approximate cycles
+/// elapsed from DefMI to UseMI (since the MIs could be re-ordered by the
+/// scheduler, and we don't have this information, the exact value cannot be
+/// known). When scheduling information is available,
+/// Sched.computeOperandLatency returns a much better estimate (especially if
+/// UseMI is non-null), so we just return that.
+static unsigned computeOperandLatency(const TargetSchedModel &Sched,
+ const MachineInstr *DefMI,
+ unsigned DefOperIdx,
+ const MachineInstr *UseMI,
+ unsigned UseOperIdx) {
+ assert(DefMI && "Non-null DefMI expected");
+ if (!Sched.hasInstrSchedModel() && !Sched.hasInstrItineraries()) {
+ unsigned DefaultDefLatency = Sched.getInstrInfo()->defaultDefLatency(
+ *Sched.getMCSchedModel(), *DefMI);
+ std::optional<unsigned> DefUseCycles =
+ estimateDefUseCycles(Sched, DefMI, UseMI);
+ if (!DefUseCycles || DefaultDefLatency <= DefUseCycles)
+ return 0;
+ return DefaultDefLatency - *DefUseCycles;
+ }
+ return Sched.computeOperandLatency(DefMI, DefOperIdx, UseMI, UseOperIdx);
+}
+
/// The length of the critical path through a trace is the maximum of two path
/// lengths:
///
@@ -813,8 +872,8 @@ updateDepth(MachineTraceMetrics::TraceBlockInfo &TBI, const MachineInstr &UseMI,
unsigned DepCycle = Cycles.lookup(Dep.DefMI).Depth;
// Add latency if DefMI is a real instruction. Transients get latency 0.
if (!Dep.DefMI->isTransient())
- DepCycle += MTM.SchedModel
- .computeOperandLatency(Dep.DefMI, Dep.DefOp, &UseMI, Dep.UseOp);
+ DepCycle += computeOperandLatency(MTM.SchedModel, Dep.DefMI, Dep.DefOp,
+ &UseMI, Dep.UseOp);
Cycle = std::max(Cycle, DepCycle);
}
// Remember the instruction depth.
@@ -929,8 +988,8 @@ static unsigned updatePhysDepsUpwards(const MachineInstr &MI, unsigned Height,
if (!MI.isTransient()) {
// We may not know the UseMI of this dependency, if it came from the
// live-in list. SchedModel can handle a NULL UseMI.
- DepHeight += SchedModel.computeOperandLatency(&MI, MO.getOperandNo(),
- I->MI, I->Op);
+ DepHeight += computeOperandLatency(SchedModel, &MI, MO.getOperandNo(),
+ I->MI, I->Op);
}
Height = std::max(Height, DepHeight);
// This regunit is dead above MI.
@@ -963,10 +1022,9 @@ static bool pushDepHeight(const DataDep &Dep, const MachineInstr &UseMI,
unsigned UseHeight, MIHeightMap &Heights,
const TargetSchedModel &SchedModel,
const TargetInstrInfo *TII) {
- // Adjust height by Dep.DefMI latency.
if (!Dep.DefMI->isTransient())
- UseHeight += SchedModel.computeOperandLatency(Dep.DefMI, Dep.DefOp, &UseMI,
- Dep.UseOp);
+ UseHeight += computeOperandLatency(SchedModel, Dep.DefMI, Dep.DefOp, &UseMI,
+ Dep.UseOp);
// Update Heights[DefMI] to be the maximum height seen.
MIHeightMap::iterator I;
@@ -1192,8 +1250,8 @@ MachineTraceMetrics::Trace::getPHIDepth(const MachineInstr &PHI) const {
unsigned DepCycle = getInstrCycles(*Dep.DefMI).Depth;
// Add latency if DefMI is a real instruction. Transients get latency 0.
if (!Dep.DefMI->isTransient())
- DepCycle += TE.MTM.SchedModel.computeOperandLatency(Dep.DefMI, Dep.DefOp,
- &PHI, Dep.UseOp);
+ DepCycle += computeOperandLatency(TE.MTM.SchedModel, Dep.DefMI, Dep.DefOp,
+ &PHI, Dep.UseOp);
return DepCycle;
}
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/bitmanip.ll b/llvm/test/CodeGen/RISCV/GlobalISel/bitmanip.ll
index 5c42fefb95b39f..69261126cd8b0e 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/bitmanip.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/bitmanip.ll
@@ -94,15 +94,15 @@ define i7 @bitreverse_i7(i7 %x) {
; RV32-NEXT: or a1, a1, a2
; RV32-NEXT: slli a2, a0, 2
; RV32-NEXT: andi a2, a2, 16
+; RV32-NEXT: or a1, a1, a2
; RV32-NEXT: andi a0, a0, 127
-; RV32-NEXT: andi a3, a0, 8
-; RV32-NEXT: or a2, a2, a3
+; RV32-NEXT: andi a2, a0, 8
; RV32-NEXT: or a1, a1, a2
; RV32-NEXT: srli a2, a0, 2
; RV32-NEXT: andi a2, a2, 4
-; RV32-NEXT: srli a3, a0, 4
-; RV32-NEXT: andi a3, a3, 2
-; RV32-NEXT: or a2, a2, a3
+; RV32-NEXT: or a1, a1, a2
+; RV32-NEXT: srli a2, a0, 4
+; RV32-NEXT: andi a2, a2, 2
; RV32-NEXT: or a1, a1, a2
; RV32-NEXT: srli a0, a0, 6
; RV32-NEXT: or a0, a1, a0
@@ -117,15 +117,15 @@ define i7 @bitreverse_i7(i7 %x) {
; RV64-NEXT: or a1, a1, a2
; RV64-NEXT: slli a2, a0, 2
; RV64-NEXT: andi a2, a2, 16
+; RV64-NEXT: or a1, a1, a2
; RV64-NEXT: andi a0, a0, 127
-; RV64-NEXT: andi a3, a0, 8
-; RV64-NEXT: or a2, a2, a3
+; RV64-NEXT: andi a2, a0, 8
; RV64-NEXT: or a1, a1, a2
; RV64-NEXT: srliw a2, a0, 2
; RV64-NEXT: andi a2, a2, 4
-; RV64-NEXT: srliw a3, a0, 4
-; RV64-NEXT: andi a3, a3, 2
-; RV64-NEXT: or a2, a2, a3
+; RV64-NEXT: or a1, a1, a2
+; RV64-NEXT: srliw a2, a0, 4
+; RV64-NEXT: andi a2, a2, 2
; RV64-NEXT: or a1, a1, a2
; RV64-NEXT: srliw a0, a0, 6
; RV64-NEXT: or a0, a1, a0
@@ -145,24 +145,24 @@ define i24 @bitreverse_i24(i24 %x) {
; RV32-NEXT: or a0, a0, a1
; RV32-NEXT: lui a1, 1048335
; RV32-NEXT: addi a1, a1, 240
-; RV32-NEXT: and a3, a1, a2
-; RV32-NEXT: and a3, a0, a3
+; RV32-NEXT: and a3, a0, a1
+; RV32-NEXT: and a3, a3, a2
; RV32-NEXT: srli a3, a3, 4
; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: and a0, a0, a1
; RV32-NEXT: or a0, a3, a0
; RV32-NEXT: lui a1, 1047757
; RV32-NEXT: addi a1, a1, -820
-; RV32-NEXT: and a3, a1, a2
-; RV32-NEXT: and a3, a0, a3
+; RV32-NEXT: and a3, a0, a1
+; RV32-NEXT: and a3, a3, a2
; RV32-NEXT: srli a3, a3, 2
; RV32-NEXT: slli a0, a0, 2
; RV32-NEXT: and a0, a0, a1
; RV32-NEXT: or a0, a3, a0
; RV32-NEXT: lui a1, 1047211
; RV32-NEXT: addi a1, a1, -1366
-; RV32-NEXT: and a2, a1, a2
-; RV32-NEXT: and a2, a0, a2
+; RV32-NEXT: and a3, a0, a1
+; RV32-NEXT: and a2, a3, a2
; RV32-NEXT: srli a2, a2, 1
; RV32-NEXT: slli a0, a0, 1
; RV32-NEXT: and a0, a0, a1
@@ -179,24 +179,24 @@ define i24 @bitreverse_i24(i24 %x) {
; RV64-NEXT: or a0, a0, a1
; RV64-NEXT: lui a1, 1048335
; RV64-NEXT: addi a1, a1, 240
-; RV64-NEXT: and a3, a1, a2
-; RV64-NEXT: and a3, a0, a3
+; RV64-NEXT: and a3, a0, a1
+; RV64-NEXT: and a3, a3, a2
; RV64-NEXT: srliw a3, a3, 4
; RV64-NEXT: slli a0, a0, 4
; RV64-NEXT: and a0, a0, a1
; RV64-NEXT: or a0, a3, a0
; RV64-NEXT: lui a1, 1047757
; RV64-NEXT: addi a1, a1, -820
-; RV64-NEXT: and a3, a1, a2
-; RV64-NEXT: and a3, a0, a3
+; RV64-NEXT: and a3, a0, a1
+; RV64-NEXT: and a3, a3, a2
; RV64-NEXT: srliw a3, a3, 2
; RV64-NEXT: slli a0, a0, 2
; RV64-NEXT: and a0, a0, a1
; RV64-NEXT: or a0, a3, a0
; RV64-NEXT: lui a1, 1047211
; RV64-NEXT: addiw a1, a1, -1366
-; RV64-NEXT: and a2, a1, a2
-; RV64-NEXT: and a2, a0, a2
+; RV64-NEXT: and a3, a0, a1
+; RV64-NEXT: and a2, a3, a2
; RV64-NEXT: srliw a2, a2, 1
; RV64-NEXT: slliw a0, a0, 1
; RV64-NEXT: and a0, a0, a1
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll b/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll
index d55adf371119b5..5723c4b9197a6a 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll
@@ -1266,8 +1266,8 @@ define i32 @va4_va_copy(i32 %argno, ...) nounwind {
; RV32-NEXT: sw a3, 4(sp)
; RV32-NEXT: lw a2, 0(a2)
; RV32-NEXT: add a0, a0, s0
-; RV32-NEXT: add a1, a1, a2
; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, a0, a2
; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 48
@@ -1319,8 +1319,8 @@ define i32 @va4_va_copy(i32 %argno, ...) nounwind {
; RV64-NEXT: sd a3, 8(sp)
; RV64-NEXT: lw a2, 0(a2)
; RV64-NEXT: add a0, a0, s0
-; RV64-NEXT: add a1, a1, a2
-; RV64-NEXT: addw a0, a0, a1
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: addw a0, a0, a2
; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; RV64-NEXT: addi sp, sp, 96
@@ -1371,8 +1371,8 @@ define i32 @va4_va_copy(i32 %argno, ...) nounwind {
; RV32-WITHFP-NEXT: sw a3, -16(s0)
; RV32-WITHFP-NEXT: lw a2, 0(a2)
; RV32-WITHFP-NEXT: add a0, a0, s1
-; RV32-WITHFP-NEXT: add a1, a1, a2
; RV32-WITHFP-NEXT: add a0, a0, a1
+; RV32-WITHFP-NEXT: add a0, a0, a2
; RV32-WITHFP-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32-WITHFP-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
; RV32-WITHFP-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
@@ -1427,8 +1427,8 @@ define i32 @va4_va_copy(i32 %argno, ...) nounwind {
; RV64-WITHFP-NEXT: sd a3, -32(s0)
; RV64-WITHFP-NEXT: lw a2, 0(a2)
; RV64-WITHFP-NEXT: add a0, a0, s1
-; RV64-WITHFP-NEXT: add a1, a1, a2
-; RV64-WITHFP-NEXT: addw a0, a0, a1
+; RV64-WITHFP-NEXT: add a0, a0, a1
+; RV64-WITHFP-NEXT: addw a0, a0, a2
; RV64-WITHFP-NEXT: ld ra, 40(sp) # 8-byte Folded Reload
; RV64-WITHFP-NEXT: ld s0, 32(sp) # 8-byte Folded Reload
; RV64-WITHFP-NEXT: ld s1, 24(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/addcarry.ll b/llvm/test/CodeGen/RISCV/addcarry.ll
index 3a4163a8bb50f9..053b98755417b2 100644
--- a/llvm/test/CodeGen/RISCV/addcarry.ll
+++ b/llvm/test/CodeGen/RISCV/addcarry.ll
@@ -18,9 +18,9 @@ define i64 @addcarry(i64 %x, i64 %y) nounwind {
; RISCV32-NEXT: sltu a7, a4, a6
; RISCV32-NEXT: sltu a5, a6, a5
; RISCV32-NEXT: mulhu a6, a0, a3
-; RISCV32-NEXT: mulhu t0, a1, a2
-; RISCV32-NEXT: add a6, a6, t0
; RISCV32-NEXT: add a5, a6, a5
+; RISCV32-NEXT: mulhu a6, a1, a2
+; RISCV32-NEXT: add a5, a5, a6
; RISCV32-NEXT: add a5, a5, a7
; RISCV32-NEXT: mul a6, a1, a3
; RISCV32-NEXT: add a5, a5, a6
diff --git a/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
index 634ed45044ee21..672625c182d0b5 100644
--- a/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
+++ b/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
@@ -227,8 +227,8 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) {
; RV32IA-NEXT: addi a5, a5, 1
; RV32IA-NEXT: sltu a7, a7, a1
; RV32IA-NEXT: neg a7, a7
-; RV32IA-NEXT: and a5, a5, a3
; RV32IA-NEXT: and a5, a7, a5
+; RV32IA-NEXT: and a5, a5, a3
; RV32IA-NEXT: sll a5, a5, a0
; RV32IA-NEXT: and a7, a6, a4
; RV32IA-NEXT: or a7, a7, a5
@@ -307,8 +307,8 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) {
; RV64IA-NEXT: addi a6, a6, 1
; RV64IA-NEXT: sltu t0, t0, a1
; RV64IA-NEXT: negw t0, t0
-; RV64IA-NEXT: and a6, a6, a3
; RV64IA-NEXT: and a6, t0, a6
+; RV64IA-NEXT: and a6, a6, a3
; RV64IA-NEXT: sllw a6, a6, a0
; RV64IA-NEXT: and a4, a4, a5
; RV64IA-NEXT: or a6, a4, a6
diff --git a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-common.ll b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-common.ll
index 278187f62cd75e..8bcdb059a95fbc 100644
--- a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-common.ll
+++ b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-common.ll
@@ -94,15 +94,15 @@ define i32 @callee_aligned_stack(i32 %a, i32 %b, fp128 %c, i32 %d, i32 %e, i64 %
; RV32I-FPELIM-LABEL: callee_aligned_stack:
; RV32I-FPELIM: # %bb.0:
; RV32I-FPELIM-NEXT: lw a0, 0(a2)
-; RV32I-FPELIM-NEXT: lw a1, 8(sp)
+; RV32I-FPELIM-NEXT: lw a1, 20(sp)
; RV32I-FPELIM-NEXT: lw a2, 0(sp)
-; RV32I-FPELIM-NEXT: lw a3, 20(sp)
+; RV32I-FPELIM-NEXT: lw a3, 8(sp)
; RV32I-FPELIM-NEXT: lw a4, 16(sp)
; RV32I-FPELIM-NEXT: add a0, a0, a7
-; RV32I-FPELIM-NEXT: add a1, a2, a1
-; RV32I-FPELIM-NEXT: add a0, a0, a1
-; RV32I-FPELIM-NEXT: add a3, a4, a3
+; RV32I-FPELIM-NEXT: add a0, a0, a2
; RV32I-FPELIM-NEXT: add a0, a0, a3
+; RV32I-FPELIM-NEXT: add a0, a0, a4
+; RV32I-FPELIM-NEXT: add a0, a0, a1
; RV32I-FPELIM-NEXT: ret
;
; RV32I-WITHFP-LABEL: callee_aligned_stack:
@@ -112,15 +112,15 @@ define i32 @callee_aligned_stack(i32 %a, i32 %b, fp128 %c, i32 %d, i32 %e, i64 %
; RV32I-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
; RV32I-WITHFP-NEXT: addi s0, sp, 16
; RV32I-WITHFP-NEXT: lw a0, 0(a2)
-; RV32I-WITHFP-NEXT: lw a1, 8(s0)
+; RV32I-WITHFP-NEXT: lw a1, 20(s0)
; RV32I-WITHFP-NEXT: lw a2, 0(s0)
-; RV32I-WITHFP-NEXT: lw a3, 20(s0)
+; RV32I-WITHFP-NEXT: lw a3, 8(s0)
; RV32I-WITHFP-NEXT: lw a4, 16(s0)
; RV32I-WITHFP-NEXT: add a0, a0, a7
-; RV32I-WITHFP-NEXT: add a1, a2, a1
-; RV32I-WITHFP-NEXT: add a0, a0, a1
-; RV32I-WITHFP-NEXT: add a3, a4, a3
+; RV32I-WITHFP-NEXT: add a0, a0, a2
; RV32I-WITHFP-NEXT: add a0, a0, a3
+; RV32I-WITHFP-NEXT: add a0, a0, a4
+; RV32I-WITHFP-NEXT: add a0, a0, a1
; RV32I-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32I-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32I-WITHFP-NEXT: addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll
index 231ed159ab2061..4906cc8eb73a53 100644
--- a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll
+++ b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll
@@ -87,16 +87,16 @@ define i32 @callee_many_scalars(i8 %a, i16 %b, i32 %c, i64 %d, i32 %e, i32 %f, i
; RV32I-FPELIM-NEXT: andi a0, a0, 255
; RV32I-FPELIM-NEXT: slli a1, a1, 16
; RV32I-FPELIM-NEXT: srli a1, a1, 16
-; RV32I-FPELIM-NEXT: add a0, a0, a2
; RV32I-FPELIM-NEXT: add a0, a0, a1
+; RV32I-FPELIM-NEXT: add a0, a0, a2
; RV32I-FPELIM-NEXT: xor a1, a4, t1
; RV32I-FPELIM-NEXT: xor a2, a3, a7
; RV32I-FPELIM-NEXT: or a1, a2, a1
; RV32I-FPELIM-NEXT: seqz a1, a1
+; RV32I-FPELIM-NEXT: add a0, a1, a0
; RV32I-FPELIM-NEXT: add a0, a0, a5
; RV32I-FPELIM-NEXT: add a0, a0, a6
; RV32I-FPELIM-NEXT: add a0, a0, t0
-; RV32I-FPELIM-NEXT: add a0, a1, a0
; RV32I-FPELIM-NEXT: ret
;
; RV32I-WITHFP-LABEL: callee_many_scalars:
@@ -110,16 +110,16 @@ define i32 @callee_many_scalars(i8 %a, i16 %b, i32 %c, i64 %d, i32 %e, i32 %f, i
; RV32I-WITHFP-NEXT: andi a0, a0, 255
; RV32I-WITHFP-NEXT: slli a1, a1, 16
; RV32I-WITHFP-NEXT: srli a1, a1, 16
-; RV32I-WITHFP-NEXT: add a0, a0, a2
; RV32I-WITHFP-NEXT: add a0, a0, a1
+; RV32I-WITHFP-NEXT: add a0, a0, a2
; RV32I-WITHFP-NEXT: xor a1, a4, t1
; RV32I-WITHFP-NEXT: xor a2, a3, a7
; RV32I-WITHFP-NEXT: or a1, a2, a1
; RV32I-WITHFP-NEXT: seqz a1, a1
+; RV32I-WITHFP-NEXT: add a0, a1, a0
; RV32I-WITHFP-NEXT: add a0, a0, a5
; RV32I-WITHFP-NEXT: add a0, a0, a6
; RV32I-WITHFP-NEXT: add a0, a0, t0
-; RV32I-WITHFP-NEXT: add a0, a1, a0
; RV32I-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32I-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32I-WITHFP-NEXT: addi sp, sp, 16
@@ -614,15 +614,15 @@ define i32 @callee_aligned_stack(i32 %a, i32 %b, fp128 %c, i32 %d, i32 %e, i64 %
; RV32I-FPELIM-LABEL: callee_aligned_stack:
; RV32I-FPELIM: # %bb.0:
; RV32I-FPELIM-NEXT: lw a0, 0(a2)
-; RV32I-FPELIM-NEXT: lw a1, 8(sp)
+; RV32I-FPELIM-NEXT: lw a1, 20(sp)
; RV32I-FPELIM-NEXT: lw a2, 0(sp)
-; RV32I-FPELIM-NEXT: lw a3, 20(sp)
+; RV32I-FPELIM-NEXT: lw a3, 8(sp)
; RV32I-FPELIM-NEXT: lw a4, 16(sp)
; RV32I-FPELIM-NEXT: add a0, a0, a7
-; RV32I-FPELIM-NEXT: add a1, a2, a1
-; RV32I-FPELIM-NEXT: add a0, a0, a1
-; RV32I-FPELIM-NEXT: add a3, a4, a3
+; RV32I-FPELIM-NEXT: add a0, a0, a2
; RV32I-FPELIM-NEXT: add a0, a0, a3
+; RV32I-FPELIM-NEXT: add a0, a0, a4
+; RV32I-FPELIM-NEXT: add a0, a0, a1
; RV32I-FPELIM-NEXT: ret
;
; RV32I-WITHFP-LABEL: callee_aligned_stack:
@@ -632,15 +632,15 @@ define i32 @callee_aligned_stack(i32 %a, i32 %b, fp128 %c, i32 %d, i32 %e, i64 %
; RV32I-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
; RV32I-WITHFP-NEXT: addi s0, sp, 16
; RV32I-WITHFP-NEXT: lw a0, 0(a2)
-; RV32I-WITHFP-NEXT: lw a1, 8(s0)
+; RV32I-WITHFP-NEXT: lw a1, 20(s0)
; RV32I-WITHFP-NEXT: lw a2, 0(s0)
-; RV32I-WITHFP-NEXT: lw a3, 20(s0)
+; RV32I-WITHFP-NEXT: lw a3, 8(s0)
; RV32I-WITHFP-NEXT: lw a4, 16(s0)
; RV32I-WITHFP-NEXT: add a0, a0, a7
-; RV32I-WITHFP-NEXT: add a1, a2, a1
-; RV32I-WITHFP-NEXT: add a0, a0, a1
-; RV32I-WITHFP-NEXT: add a3, a4, a3
+; RV32I-WITHFP-NEXT: add a0, a0, a2
; RV32I-WITHFP-NEXT: add a0, a0, a3
+; RV32I-WITHFP-NEXT: add a0, a0, a4
+; RV32I-WITHFP-NEXT: add a0, a0, a1
; RV32I-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32I-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32I-WITHFP-NEXT: addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/calling-conv-ilp32e.ll b/llvm/test/CodeGen/RISCV/calling-conv-ilp32e.ll
index d08cf577b1bdd3..69691869997666 100644
--- a/llvm/test/CodeGen/RISCV/calling-conv-ilp32e.ll
+++ b/llvm/test/CodeGen/RISCV/calling-conv-ilp32e.ll
@@ -529,16 +529,16 @@ define i32 @callee_aligned_stack(i32 %a, i32 %b, fp128 %c, i32 %d, i32 %e, i64 %
; ILP32E-FPELIM-LABEL: callee_aligned_stack:
; ILP32E-FPELIM: # %bb.0:
; ILP32E-FPELIM-NEXT: lw a0, 0(a2)
-; ILP32E-FPELIM-NEXT: lw a1, 12(sp)
+; ILP32E-FPELIM-NEXT: lw a1, 24(sp)
; ILP32E-FPELIM-NEXT: lw a2, 4(sp)
; ILP32E-FPELIM-NEXT: lw a3, 8(sp)
-; ILP32E-FPELIM-NEXT: lw a4, 24(sp)
+; ILP32E-FPELIM-NEXT: lw a4, 12(sp)
; ILP32E-FPELIM-NEXT: lw a5, 20(sp)
; ILP32E-FPELIM-NEXT: add a0, a0, a2
-; ILP32E-FPELIM-NEXT: add a1, a3, a1
-; ILP32E-FPELIM-NEXT: add a0, a0, a1
-; ILP32E-FPELIM-NEXT: add a4, a5, a4
+; ILP32E-FPELIM-NEXT: add a0, a0, a3
; ILP32E-FP...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/101389