[llvm] [RISCV] Optimise spills/fills of FPR<->GPR moves (PR #78408)

Alex Bradbury via llvm-commits llvm-commits at lists.llvm.org
Wed Jan 17 00:15:02 PST 2024


https://github.com/asb created https://github.com/llvm/llvm-project/pull/78408

If spilling the destination of a FPR<->GPR move, we can just store the source register instead. If filling the source of a FPR<->GPR move, we can just load the destination register instead. This avoids the fmv instruction because a GPR or FPR load/store can be used directly.

AArch64 and SystemZ implement a similar optimisation.

From 7751aa58dbb70ae5bc80d007f12fc82181ee34a7 Mon Sep 17 00:00:00 2001
From: Alex Bradbury <asb at igalia.com>
Date: Wed, 17 Jan 2024 06:47:59 +0000
Subject: [PATCH] [RISCV] Optimise spills/fills of FPR<->GPR moves

If spilling the destination of a FPR<->GPR move, we can just store the source
register instead. If filling the source of a FPR<->GPR move, we can just load
the destination register instead. This avoids the fmv instruction
because a GPR or FPR load/store can be used directly.

AArch64 and SystemZ implement a similar optimisation.
---
 llvm/lib/Target/RISCV/RISCVInstrInfo.cpp   | 37 ++++++++++++++++++++--
 llvm/test/CodeGen/RISCV/spill-fill-fold.ll | 36 +++++++--------------
 2 files changed, 46 insertions(+), 27 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 857e8979762cdc..1e7dd94900495c 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -763,6 +763,37 @@ MachineInstr *RISCVInstrInfo::foldMemoryOperandImpl(
     VirtRegMap *VRM) const {
   const MachineFrameInfo &MFI = MF.getFrameInfo();
 
+  if (Ops.size() != 1)
+    return nullptr;
+  unsigned Opcode = MI.getOpcode();
+
+  // If spilling the destination of a FPR<->GPR move, just store the source
+  // register instead. If filling the source of a FPR<->GPR move, just load
+  // the destination register instead.
+  if (Opcode == RISCV::FMV_D_X || Opcode == RISCV::FMV_W_X ||
+      Opcode == RISCV::FMV_X_D || Opcode == RISCV::FMV_X_W) {
+    bool IsSpill = Ops[0] == 0;
+    const MachineOperand &DstMO = MI.getOperand(0);
+    const MachineOperand &SrcMO = MI.getOperand(1);
+    const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+    const MachineRegisterInfo &MRI = MF.getRegInfo();
+    MachineBasicBlock &MBB = *MI.getParent();
+    Register DstReg = DstMO.getReg();
+    Register SrcReg = SrcMO.getReg();
+
+    auto getRegClass = [&](unsigned Reg) {
+      return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg)
+                                              : TRI.getMinimalPhysRegClass(Reg);
+    };
+    if (IsSpill)
+      storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
+                          getRegClass(SrcReg), &TRI, Register());
+    else
+      loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex,
+                           getRegClass(DstReg), &TRI, Register());
+    return &*--InsertPt;
+  }
+
   // The below optimizations narrow the load so they are only valid for little
   // endian.
   // TODO: Support big endian by adding an offset into the frame object?
@@ -770,11 +801,11 @@ MachineInstr *RISCVInstrInfo::foldMemoryOperandImpl(
     return nullptr;
 
   // Fold load from stack followed by sext.b/sext.h/sext.w/zext.b/zext.h/zext.w.
-  if (Ops.size() != 1 || Ops[0] != 1)
-   return nullptr;
+  if (Ops[0] != 1)
+    return nullptr;
 
   unsigned LoadOpc;
-  switch (MI.getOpcode()) {
+  switch (Opcode) {
   default:
     if (RISCV::isSEXT_W(MI)) {
       LoadOpc = RISCV::LW;
diff --git a/llvm/test/CodeGen/RISCV/spill-fill-fold.ll b/llvm/test/CodeGen/RISCV/spill-fill-fold.ll
index a9a0cc5cf94d85..54678eaa2b58b0 100644
--- a/llvm/test/CodeGen/RISCV/spill-fill-fold.ll
+++ b/llvm/test/CodeGen/RISCV/spill-fill-fold.ll
@@ -36,8 +36,7 @@ define float @spill_i32_to_float(i32 %a) nounwind {
 ; RV32ID-NEXT:    fsd fs9, 24(sp) # 8-byte Folded Spill
 ; RV32ID-NEXT:    fsd fs10, 16(sp) # 8-byte Folded Spill
 ; RV32ID-NEXT:    fsd fs11, 8(sp) # 8-byte Folded Spill
-; RV32ID-NEXT:    fmv.w.x fa5, a0
-; RV32ID-NEXT:    fsw fa5, 4(sp) # 4-byte Folded Spill
+; RV32ID-NEXT:    sw a0, 4(sp) # 4-byte Folded Spill
 ; RV32ID-NEXT:    #APP
 ; RV32ID-NEXT:    #NO_APP
 ; RV32ID-NEXT:    flw fa0, 4(sp) # 4-byte Folded Reload
@@ -97,8 +96,7 @@ define float @spill_i32_to_float(i32 %a) nounwind {
 ; RV64ID-NEXT:    fsd fs9, 24(sp) # 8-byte Folded Spill
 ; RV64ID-NEXT:    fsd fs10, 16(sp) # 8-byte Folded Spill
 ; RV64ID-NEXT:    fsd fs11, 8(sp) # 8-byte Folded Spill
-; RV64ID-NEXT:    fmv.w.x fa5, a0
-; RV64ID-NEXT:    fsw fa5, 4(sp) # 4-byte Folded Spill
+; RV64ID-NEXT:    sd a0, 4(sp) # 4-byte Folded Spill
 ; RV64ID-NEXT:    #APP
 ; RV64ID-NEXT:    #NO_APP
 ; RV64ID-NEXT:    flw fa0, 4(sp) # 4-byte Folded Reload
@@ -163,8 +161,7 @@ define i32 @spill_float_to_i32(float %a) nounwind {
 ; RV32ID-NEXT:    fsd fs9, 24(sp) # 8-byte Folded Spill
 ; RV32ID-NEXT:    fsd fs10, 16(sp) # 8-byte Folded Spill
 ; RV32ID-NEXT:    fsd fs11, 8(sp) # 8-byte Folded Spill
-; RV32ID-NEXT:    fmv.x.w a0, fa0
-; RV32ID-NEXT:    sw a0, 4(sp) # 4-byte Folded Spill
+; RV32ID-NEXT:    fsw fa0, 4(sp) # 4-byte Folded Spill
 ; RV32ID-NEXT:    #APP
 ; RV32ID-NEXT:    #NO_APP
 ; RV32ID-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
@@ -224,8 +221,7 @@ define i32 @spill_float_to_i32(float %a) nounwind {
 ; RV64ID-NEXT:    fsd fs9, 24(sp) # 8-byte Folded Spill
 ; RV64ID-NEXT:    fsd fs10, 16(sp) # 8-byte Folded Spill
 ; RV64ID-NEXT:    fsd fs11, 8(sp) # 8-byte Folded Spill
-; RV64ID-NEXT:    fmv.x.w a0, fa0
-; RV64ID-NEXT:    sd a0, 0(sp) # 8-byte Folded Spill
+; RV64ID-NEXT:    fsw fa0, 0(sp) # 8-byte Folded Spill
 ; RV64ID-NEXT:    #APP
 ; RV64ID-NEXT:    #NO_APP
 ; RV64ID-NEXT:    ld a0, 0(sp) # 8-byte Folded Reload
@@ -353,8 +349,7 @@ define double @spill_i64_to_double(i64 %a) nounwind {
 ; RV64ID-NEXT:    fsd fs9, 24(sp) # 8-byte Folded Spill
 ; RV64ID-NEXT:    fsd fs10, 16(sp) # 8-byte Folded Spill
 ; RV64ID-NEXT:    fsd fs11, 8(sp) # 8-byte Folded Spill
-; RV64ID-NEXT:    fmv.d.x fa5, a0
-; RV64ID-NEXT:    fsd fa5, 0(sp) # 8-byte Folded Spill
+; RV64ID-NEXT:    sd a0, 0(sp) # 8-byte Folded Spill
 ; RV64ID-NEXT:    #APP
 ; RV64ID-NEXT:    #NO_APP
 ; RV64ID-NEXT:    fld fa0, 0(sp) # 8-byte Folded Reload
@@ -484,8 +479,7 @@ define i64 @spill_double_to_i64(double %a) nounwind {
 ; RV64ID-NEXT:    fsd fs9, 24(sp) # 8-byte Folded Spill
 ; RV64ID-NEXT:    fsd fs10, 16(sp) # 8-byte Folded Spill
 ; RV64ID-NEXT:    fsd fs11, 8(sp) # 8-byte Folded Spill
-; RV64ID-NEXT:    fmv.x.d a0, fa0
-; RV64ID-NEXT:    sd a0, 0(sp) # 8-byte Folded Spill
+; RV64ID-NEXT:    fsd fa0, 0(sp) # 8-byte Folded Spill
 ; RV64ID-NEXT:    #APP
 ; RV64ID-NEXT:    #NO_APP
 ; RV64ID-NEXT:    ld a0, 0(sp) # 8-byte Folded Reload
@@ -553,8 +547,7 @@ define float @fill_i32_to_float(i32 %a) nounwind {
 ; RV32ID-NEXT:    sw a0, 4(sp) # 4-byte Folded Spill
 ; RV32ID-NEXT:    #APP
 ; RV32ID-NEXT:    #NO_APP
-; RV32ID-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
-; RV32ID-NEXT:    fmv.w.x fa0, a0
+; RV32ID-NEXT:    flw fa0, 4(sp) # 4-byte Folded Reload
 ; RV32ID-NEXT:    lw ra, 156(sp) # 4-byte Folded Reload
 ; RV32ID-NEXT:    lw s0, 152(sp) # 4-byte Folded Reload
 ; RV32ID-NEXT:    lw s1, 148(sp) # 4-byte Folded Reload
@@ -614,8 +607,7 @@ define float @fill_i32_to_float(i32 %a) nounwind {
 ; RV64ID-NEXT:    sd a0, 0(sp) # 8-byte Folded Spill
 ; RV64ID-NEXT:    #APP
 ; RV64ID-NEXT:    #NO_APP
-; RV64ID-NEXT:    ld a0, 0(sp) # 8-byte Folded Reload
-; RV64ID-NEXT:    fmv.w.x fa0, a0
+; RV64ID-NEXT:    flw fa0, 0(sp) # 8-byte Folded Reload
 ; RV64ID-NEXT:    ld ra, 200(sp) # 8-byte Folded Reload
 ; RV64ID-NEXT:    ld s0, 192(sp) # 8-byte Folded Reload
 ; RV64ID-NEXT:    ld s1, 184(sp) # 8-byte Folded Reload
@@ -680,8 +672,7 @@ define i32 @fill_float_to_i32(float %a) nounwind {
 ; RV32ID-NEXT:    fsw fa0, 4(sp) # 4-byte Folded Spill
 ; RV32ID-NEXT:    #APP
 ; RV32ID-NEXT:    #NO_APP
-; RV32ID-NEXT:    flw fa5, 4(sp) # 4-byte Folded Reload
-; RV32ID-NEXT:    fmv.x.w a0, fa5
+; RV32ID-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
 ; RV32ID-NEXT:    lw ra, 156(sp) # 4-byte Folded Reload
 ; RV32ID-NEXT:    lw s0, 152(sp) # 4-byte Folded Reload
 ; RV32ID-NEXT:    lw s1, 148(sp) # 4-byte Folded Reload
@@ -741,8 +732,7 @@ define i32 @fill_float_to_i32(float %a) nounwind {
 ; RV64ID-NEXT:    fsw fa0, 4(sp) # 4-byte Folded Spill
 ; RV64ID-NEXT:    #APP
 ; RV64ID-NEXT:    #NO_APP
-; RV64ID-NEXT:    flw fa5, 4(sp) # 4-byte Folded Reload
-; RV64ID-NEXT:    fmv.x.w a0, fa5
+; RV64ID-NEXT:    ld a0, 4(sp) # 4-byte Folded Reload
 ; RV64ID-NEXT:    ld ra, 200(sp) # 8-byte Folded Reload
 ; RV64ID-NEXT:    ld s0, 192(sp) # 8-byte Folded Reload
 ; RV64ID-NEXT:    ld s1, 184(sp) # 8-byte Folded Reload
@@ -870,8 +860,7 @@ define double @fill_i64_to_double(i64 %a) nounwind {
 ; RV64ID-NEXT:    sd a0, 0(sp) # 8-byte Folded Spill
 ; RV64ID-NEXT:    #APP
 ; RV64ID-NEXT:    #NO_APP
-; RV64ID-NEXT:    ld a0, 0(sp) # 8-byte Folded Reload
-; RV64ID-NEXT:    fmv.d.x fa0, a0
+; RV64ID-NEXT:    fld fa0, 0(sp) # 8-byte Folded Reload
 ; RV64ID-NEXT:    ld ra, 200(sp) # 8-byte Folded Reload
 ; RV64ID-NEXT:    ld s0, 192(sp) # 8-byte Folded Reload
 ; RV64ID-NEXT:    ld s1, 184(sp) # 8-byte Folded Reload
@@ -999,8 +988,7 @@ define i64 @fill_double_to_i64(double %a) nounwind {
 ; RV64ID-NEXT:    fsd fa0, 0(sp) # 8-byte Folded Spill
 ; RV64ID-NEXT:    #APP
 ; RV64ID-NEXT:    #NO_APP
-; RV64ID-NEXT:    fld fa5, 0(sp) # 8-byte Folded Reload
-; RV64ID-NEXT:    fmv.x.d a0, fa5
+; RV64ID-NEXT:    ld a0, 0(sp) # 8-byte Folded Reload
 ; RV64ID-NEXT:    ld ra, 200(sp) # 8-byte Folded Reload
 ; RV64ID-NEXT:    ld s0, 192(sp) # 8-byte Folded Reload
 ; RV64ID-NEXT:    ld s1, 184(sp) # 8-byte Folded Reload



More information about the llvm-commits mailing list