[llvm] [AArch64][SVE] Avoid transfer to GPRs for fp -> int -> fp conversions (PR #112564)
Benjamin Maxwell via llvm-commits
llvm-commits at lists.llvm.org
Wed Oct 16 08:31:30 PDT 2024
https://github.com/MacDue updated https://github.com/llvm/llvm-project/pull/112564
>From 476c0cc7cdbaed16e5454828085f675babfe6f02 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Wed, 16 Oct 2024 14:45:43 +0000
Subject: [PATCH 1/2] [AArch64][SVE] Avoid transfer to GPRs for fp -> int -> fp
conversions
When Neon is not available, use the SVE variants of FCVTZS, FCVTZU, UCVTF,
and SCVTF for fp -> int -> fp conversions to avoid moving values
to/from GPRs, which may be expensive.
Note: With +sme2p2, the single-element vector Neon variants of these
instructions could be used instead (but that feature is not implemented
yet).
Follow up to #112213.
---
.../lib/Target/AArch64/AArch64SVEInstrInfo.td | 35 ++++++++
.../sve-streaming-mode-cvt-fp-int-fp.ll | 89 +++++++++++++++----
2 files changed, 107 insertions(+), 17 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 2a857234c7d745..19dc2016f9fcf7 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -2421,6 +2421,41 @@ let Predicates = [HasSVEorSME] in {
defm FSQRT_ZPmZ : sve_fp_2op_p_zd_HSD<0b01101, "fsqrt", AArch64fsqrt_mt>;
} // End HasSVEorSME
+// Helper for creating fp -> int -> fp conversions using SVE.
+class sve_fp_int_fp_cvt<Instruction PTRUE, Instruction FROM_INT, Instruction TO_INT, SubRegIndex sub>
+ : OutPatFrag<(ops node: $Rn),
+ (EXTRACT_SUBREG
+ (FROM_INT (IMPLICIT_DEF), (PTRUE 1),
+ (TO_INT (IMPLICIT_DEF), (PTRUE 1),
+ (INSERT_SUBREG (IMPLICIT_DEF), $Rn, sub))), sub)>;
+
+// Some float -> int -> float conversion patterns where we want to keep the int
+// values in FP registers using the SVE instructions to avoid costly GPR <-> FPR
+// register transfers. Only used when NEON is not available (e.g. in streaming
+// functions).
+// TODO: When +sme2p2 is available single-element vectors should be preferred.
+def HasNoNEON : Predicate<"!Subtarget->isNeonAvailable()">;
+let Predicates = [HasSVEorSME, HasNoNEON] in {
+def : Pat<
+ (f64 (sint_to_fp (i64 (fp_to_sint f64:$Rn)))),
+ (sve_fp_int_fp_cvt<PTRUE_D, SCVTF_ZPmZ_DtoD, FCVTZS_ZPmZ_DtoD, dsub> $Rn)>;
+def : Pat<
+ (f64 (uint_to_fp (i64 (fp_to_uint f64:$Rn)))),
+ (sve_fp_int_fp_cvt<PTRUE_D, UCVTF_ZPmZ_DtoD, FCVTZU_ZPmZ_DtoD, dsub> $Rn)>;
+def : Pat<
+ (f32 (sint_to_fp (i32 (fp_to_sint f32:$Rn)))),
+ (sve_fp_int_fp_cvt<PTRUE_S, SCVTF_ZPmZ_StoS, FCVTZS_ZPmZ_StoS, ssub> $Rn)>;
+def : Pat<
+ (f32 (uint_to_fp (i32 (fp_to_uint f32:$Rn)))),
+ (sve_fp_int_fp_cvt<PTRUE_S, UCVTF_ZPmZ_StoS, FCVTZU_ZPmZ_StoS, ssub> $Rn)>;
+def : Pat<
+ (f16 (sint_to_fp (i32 (fp_to_sint f16:$Rn)))),
+ (sve_fp_int_fp_cvt<PTRUE_H, SCVTF_ZPmZ_HtoH, FCVTZS_ZPmZ_HtoH, hsub> $Rn)>;
+def : Pat<
+ (f16 (uint_to_fp (i32 (fp_to_uint f16:$Rn)))),
+ (sve_fp_int_fp_cvt<PTRUE_H, UCVTF_ZPmZ_HtoH, FCVTZU_ZPmZ_HtoH, hsub> $Rn)>;
+} // End HasSVEorSME, HasNoNEON
+
let Predicates = [HasBF16, HasSVEorSME] in {
defm BFDOT_ZZZ : sve_float_dot<0b1, 0b0, ZPR32, ZPR16, "bfdot", nxv8bf16, int_aarch64_sve_bfdot>;
defm BFDOT_ZZI : sve_float_dot_indexed<0b1, 0b00, ZPR16, ZPR3b16, "bfdot", nxv8bf16, int_aarch64_sve_bfdot_lane_v2>;
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll
index 9aadf3133ba197..fbbe2cc64ad248 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
; RUN: llc < %s | FileCheck %s --check-prefix=NON-STREAMING
target triple = "aarch64-unknown-linux-gnu"
@@ -7,10 +8,19 @@ target triple = "aarch64-unknown-linux-gnu"
define double @t1(double %x) {
; CHECK-LABEL: t1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvtzs x8, d0
-; CHECK-NEXT: scvtf d0, x8
+; CHECK-NEXT: ptrue p0.d, vl1
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d
+; CHECK-NEXT: scvtf z0.d, p0/m, z0.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
+; NONEON-NOSVE-LABEL: t1:
+; NONEON-NOSVE: // %bb.0: // %entry
+; NONEON-NOSVE-NEXT: fcvtzs x8, d0
+; NONEON-NOSVE-NEXT: scvtf d0, x8
+; NONEON-NOSVE-NEXT: ret
+;
; NON-STREAMING-LABEL: t1:
; NON-STREAMING: // %bb.0: // %entry
; NON-STREAMING-NEXT: fcvtzs d0, d0
@@ -25,10 +35,19 @@ entry:
define float @t2(float %x) {
; CHECK-LABEL: t2:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvtzs w8, s0
-; CHECK-NEXT: scvtf s0, w8
+; CHECK-NEXT: ptrue p0.s, vl1
+; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT: scvtf z0.s, p0/m, z0.s
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
; CHECK-NEXT: ret
;
+; NONEON-NOSVE-LABEL: t2:
+; NONEON-NOSVE: // %bb.0: // %entry
+; NONEON-NOSVE-NEXT: fcvtzs w8, s0
+; NONEON-NOSVE-NEXT: scvtf s0, w8
+; NONEON-NOSVE-NEXT: ret
+;
; NON-STREAMING-LABEL: t2:
; NON-STREAMING: // %bb.0: // %entry
; NON-STREAMING-NEXT: fcvtzs s0, s0
@@ -43,12 +62,21 @@ entry:
define half @t3(half %x) {
; CHECK-LABEL: t3:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvt s0, h0
-; CHECK-NEXT: fcvtzs w8, s0
-; CHECK-NEXT: scvtf s0, w8
-; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ptrue p0.h, vl1
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT: fcvtzs z0.h, p0/m, z0.h
+; CHECK-NEXT: scvtf z0.h, p0/m, z0.h
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
; CHECK-NEXT: ret
;
+; NONEON-NOSVE-LABEL: t3:
+; NONEON-NOSVE: // %bb.0: // %entry
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fcvtzs w8, s0
+; NONEON-NOSVE-NEXT: scvtf s0, w8
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: ret
+;
; NON-STREAMING-LABEL: t3:
; NON-STREAMING: // %bb.0: // %entry
; NON-STREAMING-NEXT: fcvt s0, h0
@@ -65,10 +93,19 @@ entry:
define double @t4(double %x) {
; CHECK-LABEL: t4:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvtzu x8, d0
-; CHECK-NEXT: ucvtf d0, x8
+; CHECK-NEXT: ptrue p0.d, vl1
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d
+; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
+; NONEON-NOSVE-LABEL: t4:
+; NONEON-NOSVE: // %bb.0: // %entry
+; NONEON-NOSVE-NEXT: fcvtzu x8, d0
+; NONEON-NOSVE-NEXT: ucvtf d0, x8
+; NONEON-NOSVE-NEXT: ret
+;
; NON-STREAMING-LABEL: t4:
; NON-STREAMING: // %bb.0: // %entry
; NON-STREAMING-NEXT: fcvtzu d0, d0
@@ -83,10 +120,19 @@ entry:
define float @t5(float %x) {
; CHECK-LABEL: t5:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvtzu w8, s0
-; CHECK-NEXT: ucvtf s0, w8
+; CHECK-NEXT: ptrue p0.s, vl1
+; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s
+; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
; CHECK-NEXT: ret
;
+; NONEON-NOSVE-LABEL: t5:
+; NONEON-NOSVE: // %bb.0: // %entry
+; NONEON-NOSVE-NEXT: fcvtzu w8, s0
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
+; NONEON-NOSVE-NEXT: ret
+;
; NON-STREAMING-LABEL: t5:
; NON-STREAMING: // %bb.0: // %entry
; NON-STREAMING-NEXT: fcvtzu s0, s0
@@ -101,12 +147,21 @@ entry:
define half @t6(half %x) {
; CHECK-LABEL: t6:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvt s0, h0
-; CHECK-NEXT: fcvtzu w8, s0
-; CHECK-NEXT: ucvtf s0, w8
-; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ptrue p0.h, vl1
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT: fcvtzu z0.h, p0/m, z0.h
+; CHECK-NEXT: ucvtf z0.h, p0/m, z0.h
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
; CHECK-NEXT: ret
;
+; NONEON-NOSVE-LABEL: t6:
+; NONEON-NOSVE: // %bb.0: // %entry
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fcvtzu w8, s0
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: ret
+;
; NON-STREAMING-LABEL: t6:
; NON-STREAMING: // %bb.0: // %entry
; NON-STREAMING-NEXT: fcvt s0, h0
>From 60c48e6f37eb673d9301d9865a3df8309b91d91f Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Wed, 16 Oct 2024 15:22:57 +0000
Subject: [PATCH 2/2] Note that this is for scalars
---
.../lib/Target/AArch64/AArch64SVEInstrInfo.td | 27 ++++++++++---------
1 file changed, 14 insertions(+), 13 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 19dc2016f9fcf7..078ea43a76f31d 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -2421,39 +2421,40 @@ let Predicates = [HasSVEorSME] in {
defm FSQRT_ZPmZ : sve_fp_2op_p_zd_HSD<0b01101, "fsqrt", AArch64fsqrt_mt>;
} // End HasSVEorSME
-// Helper for creating fp -> int -> fp conversions using SVE.
-class sve_fp_int_fp_cvt<Instruction PTRUE, Instruction FROM_INT, Instruction TO_INT, SubRegIndex sub>
+// Helper for creating scalar fp -> int -> fp conversions using SVE.
+class sve_scalar_fp_int_fp_cvt
+ <Instruction PTRUE, Instruction FROM_INT, Instruction TO_INT, SubRegIndex sub>
: OutPatFrag<(ops node: $Rn),
(EXTRACT_SUBREG
(FROM_INT (IMPLICIT_DEF), (PTRUE 1),
(TO_INT (IMPLICIT_DEF), (PTRUE 1),
(INSERT_SUBREG (IMPLICIT_DEF), $Rn, sub))), sub)>;
-// Some float -> int -> float conversion patterns where we want to keep the int
-// values in FP registers using the SVE instructions to avoid costly GPR <-> FPR
-// register transfers. Only used when NEON is not available (e.g. in streaming
-// functions).
-// TODO: When +sme2p2 is available single-element vectors should be preferred.
+// Some scalar float -> int -> float conversion patterns where we want to keep
+// the int values in FP registers to avoid costly GPR <-> FPR register
+// transfers using SVE instructions. Only used when NEON is not available (e.g.
+// in streaming functions).
+// TODO: When +sme2p2 is available Neon single-element vectors should be preferred.
def HasNoNEON : Predicate<"!Subtarget->isNeonAvailable()">;
let Predicates = [HasSVEorSME, HasNoNEON] in {
def : Pat<
(f64 (sint_to_fp (i64 (fp_to_sint f64:$Rn)))),
- (sve_fp_int_fp_cvt<PTRUE_D, SCVTF_ZPmZ_DtoD, FCVTZS_ZPmZ_DtoD, dsub> $Rn)>;
+ (sve_scalar_fp_int_fp_cvt<PTRUE_D, SCVTF_ZPmZ_DtoD, FCVTZS_ZPmZ_DtoD, dsub> $Rn)>;
def : Pat<
(f64 (uint_to_fp (i64 (fp_to_uint f64:$Rn)))),
- (sve_fp_int_fp_cvt<PTRUE_D, UCVTF_ZPmZ_DtoD, FCVTZU_ZPmZ_DtoD, dsub> $Rn)>;
+ (sve_scalar_fp_int_fp_cvt<PTRUE_D, UCVTF_ZPmZ_DtoD, FCVTZU_ZPmZ_DtoD, dsub> $Rn)>;
def : Pat<
(f32 (sint_to_fp (i32 (fp_to_sint f32:$Rn)))),
- (sve_fp_int_fp_cvt<PTRUE_S, SCVTF_ZPmZ_StoS, FCVTZS_ZPmZ_StoS, ssub> $Rn)>;
+ (sve_scalar_fp_int_fp_cvt<PTRUE_S, SCVTF_ZPmZ_StoS, FCVTZS_ZPmZ_StoS, ssub> $Rn)>;
def : Pat<
(f32 (uint_to_fp (i32 (fp_to_uint f32:$Rn)))),
- (sve_fp_int_fp_cvt<PTRUE_S, UCVTF_ZPmZ_StoS, FCVTZU_ZPmZ_StoS, ssub> $Rn)>;
+ (sve_scalar_fp_int_fp_cvt<PTRUE_S, UCVTF_ZPmZ_StoS, FCVTZU_ZPmZ_StoS, ssub> $Rn)>;
def : Pat<
(f16 (sint_to_fp (i32 (fp_to_sint f16:$Rn)))),
- (sve_fp_int_fp_cvt<PTRUE_H, SCVTF_ZPmZ_HtoH, FCVTZS_ZPmZ_HtoH, hsub> $Rn)>;
+ (sve_scalar_fp_int_fp_cvt<PTRUE_H, SCVTF_ZPmZ_HtoH, FCVTZS_ZPmZ_HtoH, hsub> $Rn)>;
def : Pat<
(f16 (uint_to_fp (i32 (fp_to_uint f16:$Rn)))),
- (sve_fp_int_fp_cvt<PTRUE_H, UCVTF_ZPmZ_HtoH, FCVTZU_ZPmZ_HtoH, hsub> $Rn)>;
+ (sve_scalar_fp_int_fp_cvt<PTRUE_H, UCVTF_ZPmZ_HtoH, FCVTZU_ZPmZ_HtoH, hsub> $Rn)>;
} // End HasSVEorSME, HasNoNEON
let Predicates = [HasBF16, HasSVEorSME] in {
More information about the llvm-commits
mailing list