[llvm] [AArch64] Remove redundant instructions in int-to-fp of lowest vector element (PR #98602)
via llvm-commits
llvm-commits at lists.llvm.org
Sat Jul 13 07:01:21 PDT 2024
https://github.com/SpencerAbson updated https://github.com/llvm/llvm-project/pull/98602
>From 12da54e1f2181be1095ef51309cad4f98e7a53d7 Mon Sep 17 00:00:00 2001
From: Spencer Abson <Spencer.Abson at arm.com>
Date: Fri, 12 Jul 2024 08:34:01 +0000
Subject: [PATCH 1/2] [AArch64] Remove redundant instructions in int-to-fp of
lowest vector element
---
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 26 +++
llvm/test/CodeGen/AArch64/arm64-neon-copy.ll | 15 +-
.../AArch64/fixed-point-conv-vec-pat.ll | 158 ++++++++++++++++++
.../AArch64/sve-fixed-length-int-to-fp.ll | 6 +-
4 files changed, 191 insertions(+), 14 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 78c8bf1e323ab..8659a48499f24 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6068,6 +6068,32 @@ def : Pat<(f16 (any_sint_to_fp (i32 (any_fp_to_sint f16:$Rn)))),
def : Pat<(f16 (any_uint_to_fp (i32 (any_fp_to_uint f16:$Rn)))),
(UCVTFv1i16 (f16 (FCVTZUv1f16 f16:$Rn)))>;
}
+
+// int -> float conversion of value in lane 0 of simd vector should use
+// correct cvtf variant to avoid costly fpr <-> gpr register transfers.
+def : Pat<(f32 (sint_to_fp (i32 (vector_extract (v4i32 FPR128:$Rn), (i64 0))))),
+ (SCVTFv1i32 (i32 (EXTRACT_SUBREG (v4i32 FPR128:$Rn), ssub)))>;
+
+def : Pat<(f32 (uint_to_fp (i32 (vector_extract (v4i32 FPR128:$Rn), (i64 0))))),
+ (UCVTFv1i32 (i32 (EXTRACT_SUBREG (v4i32 FPR128:$Rn), ssub)))>;
+
+def : Pat<(f64 (sint_to_fp (i64 (vector_extract (v2i64 FPR128:$Rn), (i64 0))))),
+ (SCVTFv1i64 (i64 (EXTRACT_SUBREG (v2i64 FPR128:$Rn), dsub)))>;
+
+def : Pat<(f64 (uint_to_fp (i64 (vector_extract (v2i64 FPR128:$Rn), (i64 0))))),
+ (UCVTFv1i64 (i64 (EXTRACT_SUBREG (v2i64 FPR128:$Rn), dsub)))>;
+
+// fp16: integer extraction from vector must be at least 32-bits to be legal.
+// Actual extraction result is then an in-reg sign-extension of lower 16-bits.
+let Predicates = [HasNEONandIsStreamingSafe, HasFullFP16] in {
+def : Pat<(f16 (sint_to_fp (i32 (sext_inreg (i32 (vector_extract (v8i16 FPR128:$Rn), (i64 0))), i16)))),
+ (SCVTFv1i16 (f16 (EXTRACT_SUBREG (v8i16 FPR128:$Rn), hsub)))>;
+
+// unsigned 32-bit extracted element is truncated to 16-bits using AND
+def : Pat<(f16 (uint_to_fp (i32 (and (i32 (vector_extract (v8i16 FPR128:$Rn), (i64 0))), (i32 65535))))),
+ (UCVTFv1i16 (f16 (EXTRACT_SUBREG (v8i16 FPR128:$Rn), hsub)))>;
+}
+
// If an integer is about to be converted to a floating point value,
// just load it on the floating point unit.
// Here are the patterns for 8 and 16-bits to float.
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
index 43d5ab5ab54e1..c56f4409e3a62 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -1127,8 +1127,7 @@ define <8 x i8> @test_bitcastv1f64tov8i8(<1 x i64> %a) #0 {
; CHECK-SD-LABEL: test_bitcastv1f64tov8i8:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: fmov x8, d0
-; CHECK-SD-NEXT: scvtf d0, x8
+; CHECK-SD-NEXT: scvtf d0, d0
; CHECK-SD-NEXT: neg v0.8b, v0.8b
; CHECK-SD-NEXT: ret
;
@@ -1147,8 +1146,7 @@ define <4 x i16> @test_bitcastv1f64tov4i16(<1 x i64> %a) #0 {
; CHECK-SD-LABEL: test_bitcastv1f64tov4i16:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: fmov x8, d0
-; CHECK-SD-NEXT: scvtf d0, x8
+; CHECK-SD-NEXT: scvtf d0, d0
; CHECK-SD-NEXT: neg v0.4h, v0.4h
; CHECK-SD-NEXT: ret
;
@@ -1167,8 +1165,7 @@ define <2 x i32> @test_bitcastv1f64tov2i32(<1 x i64> %a) #0 {
; CHECK-SD-LABEL: test_bitcastv1f64tov2i32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: fmov x8, d0
-; CHECK-SD-NEXT: scvtf d0, x8
+; CHECK-SD-NEXT: scvtf d0, d0
; CHECK-SD-NEXT: neg v0.2s, v0.2s
; CHECK-SD-NEXT: ret
;
@@ -1187,8 +1184,7 @@ define <1 x i64> @test_bitcastv1f64tov1i64(<1 x i64> %a) #0 {
; CHECK-SD-LABEL: test_bitcastv1f64tov1i64:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: fmov x8, d0
-; CHECK-SD-NEXT: scvtf d0, x8
+; CHECK-SD-NEXT: scvtf d0, d0
; CHECK-SD-NEXT: neg d0, d0
; CHECK-SD-NEXT: ret
;
@@ -1209,8 +1205,7 @@ define <2 x float> @test_bitcastv1f64tov2f32(<1 x i64> %a) #0 {
; CHECK-LABEL: test_bitcastv1f64tov2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: fmov x8, d0
-; CHECK-NEXT: scvtf d0, x8
+; CHECK-NEXT: scvtf d0, d0
; CHECK-NEXT: fneg v0.2s, v0.2s
; CHECK-NEXT: ret
%vcvt.i = sitofp <1 x i64> %a to <1 x double>
diff --git a/llvm/test/CodeGen/AArch64/fixed-point-conv-vec-pat.ll b/llvm/test/CodeGen/AArch64/fixed-point-conv-vec-pat.ll
index dff216192a6c3..ab7c2afbbf871 100644
--- a/llvm/test/CodeGen/AArch64/fixed-point-conv-vec-pat.ll
+++ b/llvm/test/CodeGen/AArch64/fixed-point-conv-vec-pat.ll
@@ -101,4 +101,162 @@ define <8 x half> @h_v8_s8(<8 x i16> %u) #0 {
ret <8 x half> %v
}
+; int-to-fp conversion of element in lane 0 should apply
+; cvtf on vector subregister to avoid fpr->gpr trip
+define float @l0_extract_f_v2s(<2 x i32> %u) {
+; CHECK-LABEL: l0_extract_f_v2s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: scvtf s0, s0
+; CHECK-NEXT: ret
+ %i = extractelement <2 x i32> %u, i64 0
+ %f = sitofp i32 %i to float
+ ret float %f
+}
+
+; cvtf to use ssub for bottom 32-bits from v2i32
+define float @l0_extract_f_v2u(<2 x i32> %u) {
+; CHECK-LABEL: l0_extract_f_v2u:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: ucvtf s0, s0
+; CHECK-NEXT: ret
+ %i = extractelement <2 x i32> %u, i64 0
+ %f = uitofp i32 %i to float
+ ret float %f
+}
+
+; Pattern should only apply when it is known to be lane 0
+define float @ln_extract_f_v2s(<2 x i32> %u, i64 %n) {
+; CHECK-LABEL: ln_extract_f_v2s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: add x8, sp, #8
+; CHECK-NEXT: str d0, [sp, #8]
+; CHECK-NEXT: bfi x8, x0, #2, #1
+; CHECK-NEXT: ldr s0, [x8]
+; CHECK-NEXT: scvtf s0, s0
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
+ %i = extractelement <2 x i32> %u, i64 %n
+ %f = sitofp i32 %i to float
+ ret float %f
+}
+
+; cvtf to use ssub for bottom 32-bits from v4i32
+define float @l0_extract_f_v4s(<4 x i32> %u) {
+; CHECK-LABEL: l0_extract_f_v4s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: scvtf s0, s0
+; CHECK-NEXT: ret
+ %i = extractelement <4 x i32> %u, i64 0
+ %f = sitofp i32 %i to float
+ ret float %f
+}
+
+define float @l0_extract_f_v4u(<4 x i32> %u) {
+; CHECK-LABEL: l0_extract_f_v4u:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ucvtf s0, s0
+; CHECK-NEXT: ret
+ %i = extractelement <4 x i32> %u, i64 0
+ %f = uitofp i32 %i to float
+ ret float %f
+}
+
+define float @ln_extract_f_v4s(<4 x i32> %u, i64 %n) {
+; CHECK-LABEL: ln_extract_f_v4s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str q0, [sp]
+; CHECK-NEXT: bfi x8, x0, #2, #2
+; CHECK-NEXT: ldr s0, [x8]
+; CHECK-NEXT: scvtf s0, s0
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
+ %i = extractelement <4 x i32> %u, i64 %n
+ %f = sitofp i32 %i to float
+ ret float %f
+}
+
+; cvtf to use dsub for bottom 64-bits from v2i64
+define double @l0_extract_d_v2s(<2 x i64> %u) {
+; CHECK-LABEL: l0_extract_d_v2s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: scvtf d0, d0
+; CHECK-NEXT: ret
+ %i = extractelement <2 x i64> %u, i64 0
+ %f = sitofp i64 %i to double
+ ret double %f
+}
+
+define double @l0_extract_d_v2u(<2 x i64> %u) {
+; CHECK-LABEL: l0_extract_d_v2u:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ucvtf d0, d0
+; CHECK-NEXT: ret
+ %i = extractelement <2 x i64> %u, i64 0
+ %f = uitofp i64 %i to double
+ ret double %f
+}
+
+define double @ln_extract_d_v2s(<2 x i64> %u, i64 %n) {
+; CHECK-LABEL: ln_extract_d_v2s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str q0, [sp]
+; CHECK-NEXT: bfi x8, x0, #3, #1
+; CHECK-NEXT: ldr d0, [x8]
+; CHECK-NEXT: scvtf d0, d0
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
+ %i = extractelement <2 x i64> %u, i64 %n
+ %f = sitofp i64 %i to double
+ ret double %f
+}
+
+; (fullfp16) cvtf to use hsub for bottom 16-bits from v8i16
+define half @l0_extract_h_v8s(<8 x i16> %u) #0 {
+; CHECK-LABEL: l0_extract_h_v8s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: scvtf h0, h0
+; CHECK-NEXT: ret
+ %i = extractelement <8 x i16> %u, i32 0
+ %f = sitofp i16 %i to half
+ ret half %f
+}
+
+define half @l0_extract_h_v8u(<8 x i16> %u) #0 {
+; CHECK-LABEL: l0_extract_h_v8u:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ucvtf h0, h0
+; CHECK-NEXT: ret
+ %i = extractelement <8 x i16> %u, i32 0
+ %f = uitofp i16 %i to half
+ ret half %f
+}
+
+define half @ln_extract_h_v8u(<8 x i16> %u, i32 %n) #0 {
+; CHECK-LABEL: ln_extract_h_v8u:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: str q0, [sp]
+; CHECK-NEXT: bfi x8, x0, #1, #3
+; CHECK-NEXT: ldrh w8, [x8]
+; CHECK-NEXT: ucvtf h0, w8
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
+ %i = extractelement <8 x i16> %u, i32 %n
+ %f = uitofp i16 %i to half
+ ret half %f
+}
+
attributes #0 = { "target-features"="+fullfp16"}
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll
index 5bb012ae57503..573fe3d8b8a77 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll
@@ -827,8 +827,7 @@ define <1 x double> @ucvtf_v1i64_v1f64(<1 x i64> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: ucvtf_v1i64_v1f64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: fmov x8, d0
-; CHECK-NEXT: ucvtf d0, x8
+; CHECK-NEXT: ucvtf d0, d0
; CHECK-NEXT: ret
%res = uitofp <1 x i64> %op1 to <1 x double>
ret <1 x double> %res
@@ -1752,8 +1751,7 @@ define <1 x double> @scvtf_v1i64_v1f64(<1 x i64> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: scvtf_v1i64_v1f64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: fmov x8, d0
-; CHECK-NEXT: scvtf d0, x8
+; CHECK-NEXT: scvtf d0, d0
; CHECK-NEXT: ret
%res = sitofp <1 x i64> %op1 to <1 x double>
ret <1 x double> %res
>From 62950ebf90010aa04f8d2d9bc7f3fb4b0353ee06 Mon Sep 17 00:00:00 2001
From: Spencer Abson <Spencer.Abson at arm.com>
Date: Sat, 13 Jul 2024 13:56:58 +0000
Subject: [PATCH 2/2] Reformat fp16 patterns to character limit
---
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 8659a48499f24..df4d6f6db5a6c 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6086,12 +6086,14 @@ def : Pat<(f64 (uint_to_fp (i64 (vector_extract (v2i64 FPR128:$Rn), (i64 0))))),
// fp16: integer extraction from vector must be at least 32-bits to be legal.
// Actual extraction result is then an in-reg sign-extension of lower 16-bits.
let Predicates = [HasNEONandIsStreamingSafe, HasFullFP16] in {
-def : Pat<(f16 (sint_to_fp (i32 (sext_inreg (i32 (vector_extract (v8i16 FPR128:$Rn), (i64 0))), i16)))),
- (SCVTFv1i16 (f16 (EXTRACT_SUBREG (v8i16 FPR128:$Rn), hsub)))>;
+def : Pat<(f16 (sint_to_fp (i32 (sext_inreg (i32 (vector_extract
+ (v8i16 FPR128:$Rn), (i64 0))), i16)))),
+ (SCVTFv1i16 (f16 (EXTRACT_SUBREG (v8i16 FPR128:$Rn), hsub)))>;
// unsigned 32-bit extracted element is truncated to 16-bits using AND
-def : Pat<(f16 (uint_to_fp (i32 (and (i32 (vector_extract (v8i16 FPR128:$Rn), (i64 0))), (i32 65535))))),
- (UCVTFv1i16 (f16 (EXTRACT_SUBREG (v8i16 FPR128:$Rn), hsub)))>;
+def : Pat<(f16 (uint_to_fp (i32 (and (i32 (vector_extract
+ (v8i16 FPR128:$Rn), (i64 0))), (i32 65535))))),
+ (UCVTFv1i16 (f16 (EXTRACT_SUBREG (v8i16 FPR128:$Rn), hsub)))>;
}
// If an integer is about to be converted to a floating point value,
More information about the llvm-commits
mailing list