[llvm] [ARM] Allow spilling FPSCR for MVE adc/sbc intrinsics (PR #115174)
Oliver Stannard via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 6 07:28:22 PST 2024
https://github.com/ostannard created https://github.com/llvm/llvm-project/pull/115174
The MVE VADC and VSBC instructions read and write a carry bit in FPSCR, which is exposed through the intrinsics. This makes it possible to write code in which the carry flag in FPSCR is live across a function call, or is used more than once, so it needs to be possible to spill and reload it.
There is a missed optimisation in one of the test cases, where we reload FPSCR from the stack despite it still being live; I've not found a simple way to prevent the register allocator from doing this.
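For reference, this is roughly the C-level shape of the first test case, written with the ACLE MVE intrinsics from arm_mve.h (just a sketch for illustration, not part of the patch; use_int32x4_t is a hypothetical external function used to force the carry to stay live across a call):

  #include <arm_mve.h>

  void use_int32x4_t(uint32x4_t v);

  /* Add two 256-bit values held as pairs of 128-bit halves. The carry
     produced by the low-half add is consumed by the high-half add after
     the call, so it is live in FPSCR across the call and the register
     allocator may need to spill and reload it. */
  void add_256(uint32x4_t a_low, uint32x4_t a_high,
               uint32x4_t b_low, uint32x4_t b_high) {
    unsigned carry;
    uint32x4_t low = vadciq_u32(a_low, b_low, &carry);   /* carry out */
    use_int32x4_t(low);                                   /* carry live here */
    uint32x4_t high = vadcq_u32(a_high, b_high, &carry);  /* carry in */
    use_int32x4_t(high);
  }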
From e1bdf834db9e3568983e77030ad75a31618a4a4a Mon Sep 17 00:00:00 2001
From: Oliver Stannard <oliver.stannard at arm.com>
Date: Wed, 6 Nov 2024 15:15:23 +0000
Subject: [PATCH] [ARM] Allow spilling FPSCR for MVE adc/sbc intrinsics
The MVE VADC and VSBC instructions read and write a carry bit in FPSCR,
which is exposed through the intrinsics. This makes it possible to write
code which has the FPSCR live across a function call, or which uses the
same value twice, so it needs to be possible to spill and reload it.
There is a missed optimisation in one of the test cases, where we reload
FPSCR from the stack despite it still being live; I've not found a
simple way to prevent the register allocator from doing this.
---
llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp | 15 +++
llvm/lib/Target/ARM/ARMInstrVFP.td | 13 ++-
.../CodeGen/Thumb2/mve-vadc-vsbc-spill.ll | 106 ++++++++++++++++++
3 files changed, 130 insertions(+), 4 deletions(-)
create mode 100644 llvm/test/CodeGen/Thumb2/mve-vadc-vsbc-spill.ll
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 49d1f02a2f6913..dea295d0f237f0 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -1163,6 +1163,13 @@ void ARMBaseInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
.addImm(0)
.addMemOperand(MMO)
.add(predOps(ARMCC::AL));
+ } else if (ARM::cl_FPSCR_NZCVRegClass.hasSubClassEq(RC)) {
+ BuildMI(MBB, I, DebugLoc(), get(ARM::VSTR_FPSCR_NZCVQC_off))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
} else
llvm_unreachable("Unknown reg class!");
break;
@@ -1326,6 +1333,7 @@ Register ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
case ARM::VSTRD:
case ARM::VSTRS:
case ARM::VSTR_P0_off:
+ case ARM::VSTR_FPSCR_NZCVQC_off:
case ARM::MVE_VSTRWU32:
if (MI.getOperand(1).isFI() && MI.getOperand(2).isImm() &&
MI.getOperand(2).getImm() == 0) {
@@ -1417,6 +1425,12 @@ void ARMBaseInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
.addImm(0)
.addMemOperand(MMO)
.add(predOps(ARMCC::AL));
+ } else if (ARM::cl_FPSCR_NZCVRegClass.hasSubClassEq(RC)) {
+ BuildMI(MBB, I, DL, get(ARM::VLDR_FPSCR_NZCVQC_off), DestReg)
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
} else
llvm_unreachable("Unknown reg class!");
break;
@@ -1577,6 +1591,7 @@ Register ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
case ARM::VLDRD:
case ARM::VLDRS:
case ARM::VLDR_P0_off:
+ case ARM::VLDR_FPSCR_NZCVQC_off:
case ARM::MVE_VLDRWU32:
if (MI.getOperand(1).isFI() && MI.getOperand(2).isImm() &&
MI.getOperand(2).getImm() == 0) {
diff --git a/llvm/lib/Target/ARM/ARMInstrVFP.td b/llvm/lib/Target/ARM/ARMInstrVFP.td
index 8d35e0794e98e9..5b49f728ebb8d8 100644
--- a/llvm/lib/Target/ARM/ARMInstrVFP.td
+++ b/llvm/lib/Target/ARM/ARMInstrVFP.td
@@ -2894,9 +2894,8 @@ multiclass vfp_vstrldr_sysreg<bit opc, bits<4> SysReg, string sysreg,
}
}
-let Defs = [FPSCR] in {
+let Uses = [FPSCR] in {
defm VSTR_FPSCR : vfp_vstrldr_sysreg<0b0,0b0001, "fpscr">;
- defm VSTR_FPSCR_NZCVQC : vfp_vstrldr_sysreg<0b0,0b0010, "fpscr_nzcvqc">;
let Predicates = [HasV8_1MMainline, Has8MSecExt] in {
defm VSTR_FPCXTNS : vfp_vstrldr_sysreg<0b0,0b1110, "fpcxtns">;
@@ -2918,12 +2917,18 @@ let Predicates = [HasV8_1MMainline, HasMVEInt] in {
(outs VCCR:$P0), (ins)>;
}
-let Uses = [FPSCR] in {
+let Defs = [FPSCR] in {
defm VLDR_FPSCR : vfp_vstrldr_sysreg<0b1,0b0001, "fpscr">;
- defm VLDR_FPSCR_NZCVQC : vfp_vstrldr_sysreg<0b1,0b0010, "fpscr_nzcvqc">;
let Predicates = [HasV8_1MMainline, Has8MSecExt] in {
defm VLDR_FPCXTNS : vfp_vstrldr_sysreg<0b1,0b1110, "fpcxtns">;
defm VLDR_FPCXTS : vfp_vstrldr_sysreg<0b1,0b1111, "fpcxts">;
}
}
+
+defm VSTR_FPSCR_NZCVQC : vfp_vstrldr_sysreg<0b0,0b0010, "fpscr_nzcvqc",
+ (outs), (ins cl_FPSCR_NZCV:$fpscr)>;
+let canFoldAsLoad = 1, isReMaterializable = 1 in {
+defm VLDR_FPSCR_NZCVQC : vfp_vstrldr_sysreg<0b1,0b0010, "fpscr_nzcvqc",
+ (outs cl_FPSCR_NZCV:$fpscr), (ins)>;
+}
diff --git a/llvm/test/CodeGen/Thumb2/mve-vadc-vsbc-spill.ll b/llvm/test/CodeGen/Thumb2/mve-vadc-vsbc-spill.ll
new file mode 100644
index 00000000000000..7fe8e94589a0c5
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-vadc-vsbc-spill.ll
@@ -0,0 +1,106 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple thumbv8.1m.main-arm-none-eabihf -mattr=+mve | FileCheck %s
+
+declare void @use_int32x4_t(<4 x i32>)
+
+; A 256-bit addition, with the two halves of the result passed to function
+; calls to spill the carry bit out of FPSCR.
+define void @add_256(<4 x i32> %a_low, <4 x i32> %a_high, <4 x i32> %b_low, <4 x i32> %b_high) {
+; CHECK-LABEL: add_256:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: .vsave {d8, d9, d10, d11}
+; CHECK-NEXT: vpush {d8, d9, d10, d11}
+; CHECK-NEXT: .pad #8
+; CHECK-NEXT: sub sp, #8
+; CHECK-NEXT: vadci.i32 q0, q0, q2
+; CHECK-NEXT: vmov q4, q3
+; CHECK-NEXT: vmov q5, q1
+; CHECK-NEXT: vstr fpscr_nzcvqc, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: bl use_int32x4_t
+; CHECK-NEXT: vldr fpscr_nzcvqc, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: vadc.i32 q0, q5, q4
+; CHECK-NEXT: add sp, #8
+; CHECK-NEXT: vpop {d8, d9, d10, d11}
+; CHECK-NEXT: pop.w {r7, lr}
+; CHECK-NEXT: b use_int32x4_t
+entry:
+ %adc_low = tail call { <4 x i32>, i32 } @llvm.arm.mve.vadc.v4i32(<4 x i32> %a_low, <4 x i32> %b_low, i32 0)
+ %carry = extractvalue { <4 x i32>, i32 } %adc_low, 1
+ %result_low = extractvalue { <4 x i32>, i32 } %adc_low, 0
+ tail call void @use_int32x4_t(<4 x i32> %result_low)
+ %adc_high = tail call { <4 x i32>, i32 } @llvm.arm.mve.vadc.v4i32(<4 x i32> %a_high, <4 x i32> %b_high, i32 %carry)
+ %result_high = extractvalue { <4 x i32>, i32 } %adc_high, 0
+ tail call void @use_int32x4_t(<4 x i32> %result_high)
+ ret void
+}
+
+; A 256-bit subtraction, with the two halves of the result passed to function
+; calls to spill the carry bit out of FPSCR.
+define void @sub_256(<4 x i32> %a_low, <4 x i32> %a_high, <4 x i32> %b_low, <4 x i32> %b_high) {
+; CHECK-LABEL: sub_256:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: .vsave {d8, d9, d10, d11}
+; CHECK-NEXT: vpush {d8, d9, d10, d11}
+; CHECK-NEXT: .pad #8
+; CHECK-NEXT: sub sp, #8
+; CHECK-NEXT: vsbci.i32 q0, q0, q2
+; CHECK-NEXT: vmov q4, q3
+; CHECK-NEXT: vmov q5, q1
+; CHECK-NEXT: vstr fpscr_nzcvqc, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: bl use_int32x4_t
+; CHECK-NEXT: vldr fpscr_nzcvqc, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: vsbc.i32 q0, q5, q4
+; CHECK-NEXT: add sp, #8
+; CHECK-NEXT: vpop {d8, d9, d10, d11}
+; CHECK-NEXT: pop.w {r7, lr}
+; CHECK-NEXT: b use_int32x4_t
+entry:
+ %adc_low = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a_low, <4 x i32> %b_low, i32 0)
+ %carry = extractvalue { <4 x i32>, i32 } %adc_low, 1
+ %result_low = extractvalue { <4 x i32>, i32 } %adc_low, 0
+ tail call void @use_int32x4_t(<4 x i32> %result_low)
+ %adc_high = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a_high, <4 x i32> %b_high, i32 %carry)
+ %result_high = extractvalue { <4 x i32>, i32 } %adc_high, 0
+ tail call void @use_int32x4_t(<4 x i32> %result_high)
+ ret void
+}
+
+; The carry-out of the first VADC intrinsic call is used by two other VADCs,
+; both of which will modify FPSCR, so it must be spilled and reloaded.
+; Missed optimisation: the first VLDR isn't needed, because the carry bit is
+; already in FPSCR.
+define <4 x i32> @multiple_uses_of_carry_bit(<4 x i32> %a_low, <4 x i32> %a_high, <4 x i32> %b_low, <4 x i32> %b_high, <4 x i32> %a_high_2, <4 x i32> %b_high_2) {
+; CHECK-LABEL: multiple_uses_of_carry_bit:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .pad #8
+; CHECK-NEXT: sub sp, #8
+; CHECK-NEXT: vadci.i32 q0, q0, q2
+; CHECK-NEXT: add r0, sp, #24
+; CHECK-NEXT: vstr fpscr_nzcvqc, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: vldr fpscr_nzcvqc, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: vadc.i32 q1, q1, q3
+; CHECK-NEXT: veor q0, q0, q1
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: add r0, sp, #8
+; CHECK-NEXT: vldr fpscr_nzcvqc, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: vldrw.u32 q2, [r0]
+; CHECK-NEXT: vadc.i32 q1, q2, q1
+; CHECK-NEXT: veor q0, q0, q1
+; CHECK-NEXT: add sp, #8
+; CHECK-NEXT: bx lr
+entry:
+ %adc_low = tail call { <4 x i32>, i32 } @llvm.arm.mve.vadc.v4i32(<4 x i32> %a_low, <4 x i32> %b_low, i32 0)
+ %carry = extractvalue { <4 x i32>, i32 } %adc_low, 1
+ %result_low = extractvalue { <4 x i32>, i32 } %adc_low, 0
+ %adc_high = tail call { <4 x i32>, i32 } @llvm.arm.mve.vadc.v4i32(<4 x i32> %a_high, <4 x i32> %b_high, i32 %carry)
+ %result_high = extractvalue { <4 x i32>, i32 } %adc_high, 0
+ %checksum_1 = xor <4 x i32> %result_low, %result_high
+ %adc_high_2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vadc.v4i32(<4 x i32> %a_high_2, <4 x i32> %b_high_2, i32 %carry)
+ %result_high_2 = extractvalue { <4 x i32>, i32 } %adc_high_2, 0
+ %checksum_2 = xor <4 x i32> %checksum_1, %result_high_2
+ ret <4 x i32> %checksum_2
+}