[llvm] [AArch64] Remove redundant instructions in int-to-fp of lowest vector… (PR #98602)

via llvm-commits llvm-commits at lists.llvm.org
Sat Jul 13 07:01:21 PDT 2024


https://github.com/SpencerAbson updated https://github.com/llvm/llvm-project/pull/98602

From 12da54e1f2181be1095ef51309cad4f98e7a53d7 Mon Sep 17 00:00:00 2001
From: Spencer Abson <Spencer.Abson at arm.com>
Date: Fri, 12 Jul 2024 08:34:01 +0000
Subject: [PATCH 1/2] [AArch64] Remove redundant instructions in int-to-fp of
 lowest vector element
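
A minimal LLVM IR sketch of the case this change targets (the function name
is illustrative, not part of the patch); it mirrors the new l0_extract_d_v2s
test and the updated v1i64 tests below. Converting lane 0 of a vector
currently moves the element to a GPR first; with these patterns the
conversion stays in the FP/SIMD register file:

  define double @lane0_sitofp(<2 x i64> %v) {
    %e = extractelement <2 x i64> %v, i64 0
    %f = sitofp i64 %e to double
    ret double %f
  }

  ; Before (as in the v1i64 test diffs below):
  ;   fmov x8, d0
  ;   scvtf d0, x8
  ; After:
  ;   scvtf d0, d0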

---
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |  26 +++
 llvm/test/CodeGen/AArch64/arm64-neon-copy.ll  |  15 +-
 .../AArch64/fixed-point-conv-vec-pat.ll       | 158 ++++++++++++++++++
 .../AArch64/sve-fixed-length-int-to-fp.ll     |   6 +-
 4 files changed, 191 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 78c8bf1e323ab..8659a48499f24 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6068,6 +6068,32 @@ def : Pat<(f16 (any_sint_to_fp (i32 (any_fp_to_sint f16:$Rn)))),
 def : Pat<(f16 (any_uint_to_fp (i32 (any_fp_to_uint f16:$Rn)))),
           (UCVTFv1i16 (f16 (FCVTZUv1f16 f16:$Rn)))>;
 }
+
+// An int -> float conversion of the value in lane 0 of a SIMD vector should
+// use the correct cvtf variant to avoid a costly fpr <-> gpr transfer.
+def : Pat<(f32 (sint_to_fp (i32 (vector_extract (v4i32 FPR128:$Rn), (i64 0))))),
+          (SCVTFv1i32 (i32 (EXTRACT_SUBREG (v4i32 FPR128:$Rn), ssub)))>;
+
+def : Pat<(f32 (uint_to_fp (i32 (vector_extract (v4i32 FPR128:$Rn), (i64 0))))),
+          (UCVTFv1i32 (i32 (EXTRACT_SUBREG (v4i32 FPR128:$Rn), ssub)))>;
+
+def : Pat<(f64 (sint_to_fp (i64 (vector_extract (v2i64 FPR128:$Rn), (i64 0))))),
+          (SCVTFv1i64 (i64 (EXTRACT_SUBREG (v2i64 FPR128:$Rn), dsub)))>;
+
+def : Pat<(f64 (uint_to_fp (i64 (vector_extract (v2i64 FPR128:$Rn), (i64 0))))),
+          (UCVTFv1i64 (i64 (EXTRACT_SUBREG (v2i64 FPR128:$Rn), dsub)))>;
+
+// fp16: an integer extract from a vector must be at least 32 bits to be legal.
+// The extracted value is then an in-reg sign-extension of the lower 16 bits.
+let Predicates = [HasNEONandIsStreamingSafe, HasFullFP16] in {
+def : Pat<(f16 (sint_to_fp (i32 (sext_inreg (i32 (vector_extract (v8i16 FPR128:$Rn), (i64 0))), i16)))),
+          (SCVTFv1i16 (f16 (EXTRACT_SUBREG (v8i16 FPR128:$Rn), hsub)))>;
+
+// An unsigned 32-bit extracted element is truncated to 16 bits using an AND.
+def : Pat<(f16 (uint_to_fp (i32 (and (i32 (vector_extract (v8i16 FPR128:$Rn), (i64 0))), (i32 65535))))),
+          (UCVTFv1i16 (f16 (EXTRACT_SUBREG (v8i16 FPR128:$Rn), hsub)))>;
+}
+
 // If an integer is about to be converted to a floating point value,
 // just load it on the floating point unit.
 // Here are the patterns for 8 and 16-bits to float.
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
index 43d5ab5ab54e1..c56f4409e3a62 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -1127,8 +1127,7 @@ define <8 x i8> @test_bitcastv1f64tov8i8(<1 x i64> %a) #0 {
 ; CHECK-SD-LABEL: test_bitcastv1f64tov8i8:
 ; CHECK-SD:       // %bb.0:
 ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT:    fmov x8, d0
-; CHECK-SD-NEXT:    scvtf d0, x8
+; CHECK-SD-NEXT:    scvtf d0, d0
 ; CHECK-SD-NEXT:    neg v0.8b, v0.8b
 ; CHECK-SD-NEXT:    ret
 ;
@@ -1147,8 +1146,7 @@ define <4 x i16> @test_bitcastv1f64tov4i16(<1 x i64> %a) #0 {
 ; CHECK-SD-LABEL: test_bitcastv1f64tov4i16:
 ; CHECK-SD:       // %bb.0:
 ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT:    fmov x8, d0
-; CHECK-SD-NEXT:    scvtf d0, x8
+; CHECK-SD-NEXT:    scvtf d0, d0
 ; CHECK-SD-NEXT:    neg v0.4h, v0.4h
 ; CHECK-SD-NEXT:    ret
 ;
@@ -1167,8 +1165,7 @@ define <2 x i32> @test_bitcastv1f64tov2i32(<1 x i64> %a) #0 {
 ; CHECK-SD-LABEL: test_bitcastv1f64tov2i32:
 ; CHECK-SD:       // %bb.0:
 ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT:    fmov x8, d0
-; CHECK-SD-NEXT:    scvtf d0, x8
+; CHECK-SD-NEXT:    scvtf d0, d0
 ; CHECK-SD-NEXT:    neg v0.2s, v0.2s
 ; CHECK-SD-NEXT:    ret
 ;
@@ -1187,8 +1184,7 @@ define <1 x i64> @test_bitcastv1f64tov1i64(<1 x i64> %a) #0 {
 ; CHECK-SD-LABEL: test_bitcastv1f64tov1i64:
 ; CHECK-SD:       // %bb.0:
 ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT:    fmov x8, d0
-; CHECK-SD-NEXT:    scvtf d0, x8
+; CHECK-SD-NEXT:    scvtf d0, d0
 ; CHECK-SD-NEXT:    neg d0, d0
 ; CHECK-SD-NEXT:    ret
 ;
@@ -1209,8 +1205,7 @@ define <2 x float> @test_bitcastv1f64tov2f32(<1 x i64> %a) #0 {
 ; CHECK-LABEL: test_bitcastv1f64tov2f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    scvtf d0, x8
+; CHECK-NEXT:    scvtf d0, d0
 ; CHECK-NEXT:    fneg v0.2s, v0.2s
 ; CHECK-NEXT:    ret
   %vcvt.i = sitofp <1 x i64> %a to <1 x double>
diff --git a/llvm/test/CodeGen/AArch64/fixed-point-conv-vec-pat.ll b/llvm/test/CodeGen/AArch64/fixed-point-conv-vec-pat.ll
index dff216192a6c3..ab7c2afbbf871 100644
--- a/llvm/test/CodeGen/AArch64/fixed-point-conv-vec-pat.ll
+++ b/llvm/test/CodeGen/AArch64/fixed-point-conv-vec-pat.ll
@@ -101,4 +101,162 @@ define <8 x half> @h_v8_s8(<8 x i16> %u) #0 {
   ret <8 x half> %v
 }
 
+; An int-to-fp conversion of the element in lane 0 should apply
+; cvtf to the vector subregister to avoid an fpr->gpr round trip.
+define float @l0_extract_f_v2s(<2 x i32> %u) {
+; CHECK-LABEL: l0_extract_f_v2s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    scvtf s0, s0
+; CHECK-NEXT:    ret
+  %i = extractelement <2 x i32> %u, i64 0
+  %f = sitofp i32 %i to float
+  ret float %f
+}
+
+; cvtf should use ssub for the bottom 32 bits of the v2i32.
+define float @l0_extract_f_v2u(<2 x i32> %u) {
+; CHECK-LABEL: l0_extract_f_v2u:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    ucvtf s0, s0
+; CHECK-NEXT:    ret
+  %i = extractelement <2 x i32> %u, i64 0
+  %f = uitofp i32 %i to float
+  ret float %f
+}
+
+; The pattern should only apply when the index is known to be lane 0.
+define float @ln_extract_f_v2s(<2 x i32> %u, i64 %n) {
+; CHECK-LABEL: ln_extract_f_v2s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    add x8, sp, #8
+; CHECK-NEXT:    str d0, [sp, #8]
+; CHECK-NEXT:    bfi x8, x0, #2, #1
+; CHECK-NEXT:    ldr s0, [x8]
+; CHECK-NEXT:    scvtf s0, s0
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+  %i = extractelement <2 x i32> %u, i64 %n
+  %f = sitofp i32 %i to float
+  ret float %f
+}
+
+; cvtf should use ssub for the bottom 32 bits of the v4i32.
+define float @l0_extract_f_v4s(<4 x i32> %u) {
+; CHECK-LABEL: l0_extract_f_v4s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    scvtf s0, s0
+; CHECK-NEXT:    ret
+  %i = extractelement <4 x i32> %u, i64 0
+  %f = sitofp i32 %i to float
+  ret float %f
+}
+
+define float @l0_extract_f_v4u(<4 x i32> %u) {
+; CHECK-LABEL: l0_extract_f_v4u:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ucvtf s0, s0
+; CHECK-NEXT:    ret
+  %i = extractelement <4 x i32> %u, i64 0
+  %f = uitofp i32 %i to float
+  ret float %f
+}
+
+define float @ln_extract_f_v4s(<4 x i32> %u, i64 %n) {
+; CHECK-LABEL: ln_extract_f_v4s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str q0, [sp]
+; CHECK-NEXT:    bfi x8, x0, #2, #2
+; CHECK-NEXT:    ldr s0, [x8]
+; CHECK-NEXT:    scvtf s0, s0
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+  %i = extractelement <4 x i32> %u, i64 %n
+  %f = sitofp i32 %i to float
+  ret float %f
+}
+
+; cvtf should use dsub for the bottom 64 bits of the v2i64.
+define double @l0_extract_d_v2s(<2 x i64> %u) {
+; CHECK-LABEL: l0_extract_d_v2s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    scvtf d0, d0
+; CHECK-NEXT:    ret
+  %i = extractelement <2 x i64> %u, i64 0
+  %f = sitofp i64 %i to double
+  ret double %f
+}
+
+define double @l0_extract_d_v2u(<2 x i64> %u) {
+; CHECK-LABEL: l0_extract_d_v2u:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ucvtf d0, d0
+; CHECK-NEXT:    ret
+  %i = extractelement <2 x i64> %u, i64 0
+  %f = uitofp i64 %i to double
+  ret double %f
+}
+
+define double @ln_extract_d_v2s(<2 x i64> %u, i64 %n) {
+; CHECK-LABEL: ln_extract_d_v2s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str q0, [sp]
+; CHECK-NEXT:    bfi x8, x0, #3, #1
+; CHECK-NEXT:    ldr d0, [x8]
+; CHECK-NEXT:    scvtf d0, d0
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+  %i = extractelement <2 x i64> %u, i64 %n
+  %f = sitofp i64 %i to double
+  ret double %f
+}
+
+; (fullfp16) cvtf should use hsub for the bottom 16 bits of the v8i16.
+define half @l0_extract_h_v8s(<8 x i16> %u) #0 {
+; CHECK-LABEL: l0_extract_h_v8s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    scvtf h0, h0
+; CHECK-NEXT:    ret
+  %i = extractelement <8 x i16> %u, i32 0
+  %f = sitofp i16 %i to half
+  ret half %f
+}
+
+define half @l0_extract_h_v8u(<8 x i16> %u) #0 {
+; CHECK-LABEL: l0_extract_h_v8u:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ucvtf h0, h0
+; CHECK-NEXT:    ret
+  %i = extractelement <8 x i16> %u, i32 0
+  %f = uitofp i16 %i to half
+  ret half %f
+}
+
+define half @ln_extract_h_v8u(<8 x i16> %u, i32 %n) #0 {
+; CHECK-LABEL: ln_extract_h_v8u:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    str q0, [sp]
+; CHECK-NEXT:    bfi x8, x0, #1, #3
+; CHECK-NEXT:    ldrh w8, [x8]
+; CHECK-NEXT:    ucvtf h0, w8
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+  %i = extractelement <8 x i16> %u, i32 %n
+  %f = uitofp i16 %i to half
+  ret half %f
+}
+
 attributes #0 = { "target-features"="+fullfp16"}
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll
index 5bb012ae57503..573fe3d8b8a77 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll
@@ -827,8 +827,7 @@ define <1 x double> @ucvtf_v1i64_v1f64(<1 x i64> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ucvtf_v1i64_v1f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    ucvtf d0, x8
+; CHECK-NEXT:    ucvtf d0, d0
 ; CHECK-NEXT:    ret
   %res = uitofp <1 x i64> %op1 to <1 x double>
   ret <1 x double> %res
@@ -1752,8 +1751,7 @@ define <1 x double> @scvtf_v1i64_v1f64(<1 x i64> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: scvtf_v1i64_v1f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    scvtf d0, x8
+; CHECK-NEXT:    scvtf d0, d0
 ; CHECK-NEXT:    ret
   %res = sitofp <1 x i64> %op1 to <1 x double>
   ret <1 x double> %res
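
Background for the fp16 patterns above: an i16 element extract is not legal,
so during legalization the DAG extracts a 32-bit value and then narrows it
(sext_inreg in the signed case, an AND with 65535 in the unsigned case),
which is what the new patterns match. A minimal IR sketch of the signed case
(function name illustrative), matching the l0_extract_h_v8s test above:

  define half @lane0_half(<8 x i16> %v) #0 {
    %e = extractelement <8 x i16> %v, i32 0
    %f = sitofp i16 %e to half
    ret half %f
  }
  attributes #0 = { "target-features"="+fullfp16" }

  ; Expected codegen with these patterns (per l0_extract_h_v8s): scvtf h0, h0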

From 62950ebf90010aa04f8d2d9bc7f3fb4b0353ee06 Mon Sep 17 00:00:00 2001
From: Spencer Abson <Spencer.Abson at arm.com>
Date: Sat, 13 Jul 2024 13:56:58 +0000
Subject: [PATCH 2/2] Reformat fp16 patterns to character limit

---
 llvm/lib/Target/AArch64/AArch64InstrInfo.td | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 8659a48499f24..df4d6f6db5a6c 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6086,12 +6086,14 @@ def : Pat<(f64 (uint_to_fp (i64 (vector_extract (v2i64 FPR128:$Rn), (i64 0))))),
 // fp16: an integer extract from a vector must be at least 32 bits to be legal.
 // The extracted value is then an in-reg sign-extension of the lower 16 bits.
 let Predicates = [HasNEONandIsStreamingSafe, HasFullFP16] in {
-def : Pat<(f16 (sint_to_fp (i32 (sext_inreg (i32 (vector_extract (v8i16 FPR128:$Rn), (i64 0))), i16)))),
-          (SCVTFv1i16 (f16 (EXTRACT_SUBREG (v8i16 FPR128:$Rn), hsub)))>;
+def : Pat<(f16 (sint_to_fp (i32 (sext_inreg (i32 (vector_extract 
+          (v8i16 FPR128:$Rn), (i64 0))), i16)))), 
+              (SCVTFv1i16 (f16 (EXTRACT_SUBREG (v8i16 FPR128:$Rn), hsub)))>;
 
 // An unsigned 32-bit extracted element is truncated to 16 bits using an AND.
-def : Pat<(f16 (uint_to_fp (i32 (and (i32 (vector_extract (v8i16 FPR128:$Rn), (i64 0))), (i32 65535))))),
-          (UCVTFv1i16 (f16 (EXTRACT_SUBREG (v8i16 FPR128:$Rn), hsub)))>;
+def : Pat<(f16 (uint_to_fp (i32 (and (i32 (vector_extract 
+          (v8i16 FPR128:$Rn), (i64 0))), (i32 65535))))),
+              (UCVTFv1i16 (f16 (EXTRACT_SUBREG (v8i16 FPR128:$Rn), hsub)))>;
 }
 
 // If an integer is about to be converted to a floating point value,
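
A note on scope, per the ln_extract_* tests in the first patch: the new
patterns only fire when the index is a constant zero. With a variable lane
index the element is still materialized via the stack before the convert,
e.g. for a sketch like (function name illustrative):

  define float @lane_n_sitofp(<4 x i32> %v, i64 %n) {
    %e = extractelement <4 x i32> %v, i64 %n
    %f = sitofp i32 %e to float
    ret float %f
  }

  ; Per ln_extract_f_v4s, this still stores the vector to the stack and loads
  ; the selected lane before the scvtf; only the known lane-0 form changes.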
