[llvm] [ARM] Allow spilling FPSCR for MVE adc/sbc intrinsics (PR #115174)
Oliver Stannard via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 6 07:28:22 PST 2024
https://github.com/ostannard created https://github.com/llvm/llvm-project/pull/115174
The MVE VADC and VSBC instructions read and write a carry bit in FPSCR, which is exposed through the intrinsics. This makes it possible to write code in which the carry flag in FPSCR is live across a function call, or is used more than once, so it needs to be possible to spill and reload it.
There is a missed optimisation in one of the test cases, where we reload FPSCR from the stack despite it still being live; I've not found a simple way to prevent the register allocator from doing this.
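For reference, this is roughly the C-level shape of the first test case, written with the ACLE MVE intrinsics from arm_mve.h (just a sketch for illustration, not part of the patch; use_int32x4_t is a hypothetical external function used to force the carry to stay live across a call):

  #include <arm_mve.h>

  void use_int32x4_t(uint32x4_t v);

  /* Add two 256-bit values held as pairs of 128-bit halves. The carry
     produced by the low-half add is consumed by the high-half add after
     the call, so it is live in FPSCR across the call and the register
     allocator may need to spill and reload it. */
  void add_256(uint32x4_t a_low, uint32x4_t a_high,
               uint32x4_t b_low, uint32x4_t b_high) {
    unsigned carry;
    uint32x4_t low = vadciq_u32(a_low, b_low, &carry);   /* carry out */
    use_int32x4_t(low);                                   /* carry live here */
    uint32x4_t high = vadcq_u32(a_high, b_high, &carry);  /* carry in */
    use_int32x4_t(high);
  }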
From e1bdf834db9e3568983e77030ad75a31618a4a4a Mon Sep 17 00:00:00 2001
From: Oliver Stannard <oliver.stannard at arm.com>
Date: Wed, 6 Nov 2024 15:15:23 +0000
Subject: [PATCH] [ARM] Allow spilling FPSCR for MVE adc/sbc intrinsics
The MVE VADC and VSBC instructions read and write a carry bit in FPSCR,
which is exposed through the intrinsics. This makes it possible to write
code which has the FPSCR live across a function call, or which uses the
same value twice, so it needs to be possible to spill and reload it.
There is a missed optimisation in one of the test cases, where we reload
FPSCR from the stack despite it still being live; I've not found a
simple way to prevent the register allocator from doing this.
---
llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp | 15 +++
llvm/lib/Target/ARM/ARMInstrVFP.td | 13 ++-
.../CodeGen/Thumb2/mve-vadc-vsbc-spill.ll | 106 ++++++++++++++++++
3 files changed, 130 insertions(+), 4 deletions(-)
create mode 100644 llvm/test/CodeGen/Thumb2/mve-vadc-vsbc-spill.ll
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 49d1f02a2f6913..dea295d0f237f0 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -1163,6 +1163,13 @@ void ARMBaseInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
.addImm(0)
.addMemOperand(MMO)
.add(predOps(ARMCC::AL));
+ } else if (ARM::cl_FPSCR_NZCVRegClass.hasSubClassEq(RC)) {
+ BuildMI(MBB, I, DebugLoc(), get(ARM::VSTR_FPSCR_NZCVQC_off))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
} else
llvm_unreachable("Unknown reg class!");
break;
@@ -1326,6 +1333,7 @@ Register ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
case ARM::VSTRD:
case ARM::VSTRS:
case ARM::VSTR_P0_off:
+ case ARM::VSTR_FPSCR_NZCVQC_off:
case ARM::MVE_VSTRWU32:
if (MI.getOperand(1).isFI() && MI.getOperand(2).isImm() &&
MI.getOperand(2).getImm() == 0) {
@@ -1417,6 +1425,12 @@ void ARMBaseInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
.addImm(0)
.addMemOperand(MMO)
.add(predOps(ARMCC::AL));
+ } else if (ARM::cl_FPSCR_NZCVRegClass.hasSubClassEq(RC)) {
+ BuildMI(MBB, I, DL, get(ARM::VLDR_FPSCR_NZCVQC_off), DestReg)
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
} else
llvm_unreachable("Unknown reg class!");
break;
@@ -1577,6 +1591,7 @@ Register ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
case ARM::VLDRD:
case ARM::VLDRS:
case ARM::VLDR_P0_off:
+ case ARM::VLDR_FPSCR_NZCVQC_off:
case ARM::MVE_VLDRWU32:
if (MI.getOperand(1).isFI() && MI.getOperand(2).isImm() &&
MI.getOperand(2).getImm() == 0) {
diff --git a/llvm/lib/Target/ARM/ARMInstrVFP.td b/llvm/lib/Target/ARM/ARMInstrVFP.td
index 8d35e0794e98e9..5b49f728ebb8d8 100644
--- a/llvm/lib/Target/ARM/ARMInstrVFP.td
+++ b/llvm/lib/Target/ARM/ARMInstrVFP.td
@@ -2894,9 +2894,8 @@ multiclass vfp_vstrldr_sysreg<bit opc, bits<4> SysReg, string sysreg,
}
}
-let Defs = [FPSCR] in {
+let Uses = [FPSCR] in {
defm VSTR_FPSCR : vfp_vstrldr_sysreg<0b0,0b0001, "fpscr">;
- defm VSTR_FPSCR_NZCVQC : vfp_vstrldr_sysreg<0b0,0b0010, "fpscr_nzcvqc">;
let Predicates = [HasV8_1MMainline, Has8MSecExt] in {
defm VSTR_FPCXTNS : vfp_vstrldr_sysreg<0b0,0b1110, "fpcxtns">;
@@ -2918,12 +2917,18 @@ let Predicates = [HasV8_1MMainline, HasMVEInt] in {
(outs VCCR:$P0), (ins)>;
}
-let Uses = [FPSCR] in {
+let Defs = [FPSCR] in {
defm VLDR_FPSCR : vfp_vstrldr_sysreg<0b1,0b0001, "fpscr">;
- defm VLDR_FPSCR_NZCVQC : vfp_vstrldr_sysreg<0b1,0b0010, "fpscr_nzcvqc">;
let Predicates = [HasV8_1MMainline, Has8MSecExt] in {
defm VLDR_FPCXTNS : vfp_vstrldr_sysreg<0b1,0b1110, "fpcxtns">;
defm VLDR_FPCXTS : vfp_vstrldr_sysreg<0b1,0b1111, "fpcxts">;
}
}
+
+defm VSTR_FPSCR_NZCVQC : vfp_vstrldr_sysreg<0b0,0b0010, "fpscr_nzcvqc",
+ (outs), (ins cl_FPSCR_NZCV:$fpscr)>;
+let canFoldAsLoad = 1, isReMaterializable = 1 in {
+defm VLDR_FPSCR_NZCVQC : vfp_vstrldr_sysreg<0b1,0b0010, "fpscr_nzcvqc",
+ (outs cl_FPSCR_NZCV:$fpscr), (ins)>;
+}
diff --git a/llvm/test/CodeGen/Thumb2/mve-vadc-vsbc-spill.ll b/llvm/test/CodeGen/Thumb2/mve-vadc-vsbc-spill.ll
new file mode 100644
index 00000000000000..7fe8e94589a0c5
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-vadc-vsbc-spill.ll
@@ -0,0 +1,106 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple thumbv8.1m.main-arm-none-eabihf -mattr=+mve | FileCheck %s
+
+declare void @use_int32x4_t(<4 x i32>)
+
+; A 256-bit addition, with the two halves of the result passed to function
+; calls to spill the carry bit out of FPSCR.
+define void @add_256(<4 x i32> %a_low, <4 x i32> %a_high, <4 x i32> %b_low, <4 x i32> %b_high) {
+; CHECK-LABEL: add_256:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: .vsave {d8, d9, d10, d11}
+; CHECK-NEXT: vpush {d8, d9, d10, d11}
+; CHECK-NEXT: .pad #8
+; CHECK-NEXT: sub sp, #8
+; CHECK-NEXT: vadci.i32 q0, q0, q2
+; CHECK-NEXT: vmov q4, q3
+; CHECK-NEXT: vmov q5, q1
+; CHECK-NEXT: vstr fpscr_nzcvqc, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: bl use_int32x4_t
+; CHECK-NEXT: vldr fpscr_nzcvqc, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: vadc.i32 q0, q5, q4
+; CHECK-NEXT: add sp, #8
+; CHECK-NEXT: vpop {d8, d9, d10, d11}
+; CHECK-NEXT: pop.w {r7, lr}
+; CHECK-NEXT: b use_int32x4_t
+entry:
+ %adc_low = tail call { <4 x i32>, i32 } @llvm.arm.mve.vadc.v4i32(<4 x i32> %a_low, <4 x i32> %b_low, i32 0)
+ %carry = extractvalue { <4 x i32>, i32 } %adc_low, 1
+ %result_low = extractvalue { <4 x i32>, i32 } %adc_low, 0
+ tail call void @use_int32x4_t(<4 x i32> %result_low)
+ %adc_high = tail call { <4 x i32>, i32 } @llvm.arm.mve.vadc.v4i32(<4 x i32> %a_high, <4 x i32> %b_high, i32 %carry)
+ %result_high = extractvalue { <4 x i32>, i32 } %adc_high, 0
+ tail call void @use_int32x4_t(<4 x i32> %result_high)
+ ret void
+}
+
+; A 256-bit subtraction, with the two halves of the result passed to function
+; calls to spill the carry bit out of FPSCR.
+define void @sub_256(<4 x i32> %a_low, <4 x i32> %a_high, <4 x i32> %b_low, <4 x i32> %b_high) {
+; CHECK-LABEL: sub_256:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: .vsave {d8, d9, d10, d11}
+; CHECK-NEXT: vpush {d8, d9, d10, d11}
+; CHECK-NEXT: .pad #8
+; CHECK-NEXT: sub sp, #8
+; CHECK-NEXT: vsbci.i32 q0, q0, q2
+; CHECK-NEXT: vmov q4, q3
+; CHECK-NEXT: vmov q5, q1
+; CHECK-NEXT: vstr fpscr_nzcvqc, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: bl use_int32x4_t
+; CHECK-NEXT: vldr fpscr_nzcvqc, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: vsbc.i32 q0, q5, q4
+; CHECK-NEXT: add sp, #8
+; CHECK-NEXT: vpop {d8, d9, d10, d11}
+; CHECK-NEXT: pop.w {r7, lr}
+; CHECK-NEXT: b use_int32x4_t
+entry:
+ %adc_low = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a_low, <4 x i32> %b_low, i32 0)
+ %carry = extractvalue { <4 x i32>, i32 } %adc_low, 1
+ %result_low = extractvalue { <4 x i32>, i32 } %adc_low, 0
+ tail call void @use_int32x4_t(<4 x i32> %result_low)
+ %adc_high = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a_high, <4 x i32> %b_high, i32 %carry)
+ %result_high = extractvalue { <4 x i32>, i32 } %adc_high, 0
+ tail call void @use_int32x4_t(<4 x i32> %result_high)
+ ret void
+}
+
+; The carry-out of the first VADC intrinsic call is used by two other VADCs,
+; both of which will modify FPSCR, so it must be spilled and reloaded.
+; Missed optimisation: the first VLDR isn't needed, because the carry bit is
+; already in FPSCR.
+define <4 x i32> @multiple_uses_of_carry_bit(<4 x i32> %a_low, <4 x i32> %a_high, <4 x i32> %b_low, <4 x i32> %b_high, <4 x i32> %a_high_2, <4 x i32> %b_high_2) {
+; CHECK-LABEL: multiple_uses_of_carry_bit:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .pad #8
+; CHECK-NEXT: sub sp, #8
+; CHECK-NEXT: vadci.i32 q0, q0, q2
+; CHECK-NEXT: add r0, sp, #24
+; CHECK-NEXT: vstr fpscr_nzcvqc, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: vldr fpscr_nzcvqc, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: vadc.i32 q1, q1, q3
+; CHECK-NEXT: veor q0, q0, q1
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: add r0, sp, #8
+; CHECK-NEXT: vldr fpscr_nzcvqc, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: vldrw.u32 q2, [r0]
+; CHECK-NEXT: vadc.i32 q1, q2, q1
+; CHECK-NEXT: veor q0, q0, q1
+; CHECK-NEXT: add sp, #8
+; CHECK-NEXT: bx lr
+entry:
+ %adc_low = tail call { <4 x i32>, i32 } @llvm.arm.mve.vadc.v4i32(<4 x i32> %a_low, <4 x i32> %b_low, i32 0)
+ %carry = extractvalue { <4 x i32>, i32 } %adc_low, 1
+ %result_low = extractvalue { <4 x i32>, i32 } %adc_low, 0
+ %adc_high = tail call { <4 x i32>, i32 } @llvm.arm.mve.vadc.v4i32(<4 x i32> %a_high, <4 x i32> %b_high, i32 %carry)
+ %result_high = extractvalue { <4 x i32>, i32 } %adc_high, 0
+ %checksum_1 = xor <4 x i32> %result_low, %result_high
+ %adc_high_2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vadc.v4i32(<4 x i32> %a_high_2, <4 x i32> %b_high_2, i32 %carry)
+ %result_high_2 = extractvalue { <4 x i32>, i32 } %adc_high_2, 0
+ %checksum_2 = xor <4 x i32> %checksum_1, %result_high_2
+ ret <4 x i32> %checksum_2
+}