[llvm] e86f9ba - [llvm][AArch64][SVE] Remove extra fmov instruction with certain literals
David Truby via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 16 06:17:03 PST 2021
Author: David Truby
Date: 2021-02-16T14:16:33Z
New Revision: e86f9ba15c41492c4ff9cd860136f69cfb4e17d2
URL: https://github.com/llvm/llvm-project/commit/e86f9ba15c41492c4ff9cd860136f69cfb4e17d2
DIFF: https://github.com/llvm/llvm-project/commit/e86f9ba15c41492c4ff9cd860136f69cfb4e17d2.diff
LOG: [llvm][AArch64][SVE] Remove extra fmov instruction with certain literals
When a literal that cannot fit in the immediate form of the fmov instruction
is used to initialise an SVE vector, a redundant fmov is currently
generated. This patch adds a codegen pattern that prevents the extra
instruction from being generated.
Differential Revision: https://reviews.llvm.org/D96700
Co-Authored-By: Paul Walker <paul.walker at arm.com>
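
[Editorial illustration, not part of the commit message: a minimal
before/after sketch for a splat of 42.0f, whose bit pattern does not fit
fmov's immediate encoding. The pre-patch sequence is assumed from the
description above; the post-patch sequence matches the CHECK lines in the
tests below.]

    // Before (assumed): the literal takes a GPR -> FPR -> vector detour.
    mov  w8, #1109917696    // bit pattern of float 42.0
    fmov s0, w8             // the redundant fmov this patch removes
    mov  z0.s, s0           // splat from the FP register
    // After: broadcast directly from the GPR.
    mov  w8, #1109917696
    mov  z0.s, w8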
Added:
Modified:
llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
llvm/test/CodeGen/AArch64/sve-intrinsics-dup-x.ll
llvm/test/CodeGen/AArch64/sve-vector-splat.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index e562b1efa10f..95ea9297e7ac 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -553,6 +553,14 @@ let Predicates = [HasSVE] in {
def : Pat<(nxv2i64 (AArch64dup (i64 (SVE8BitLslImm i32:$a, i32:$b)))),
(DUP_ZI_D $a, $b)>;
+ // Duplicate immediate FP into all vector elements.
+ def : Pat<(nxv2f32 (AArch64dup (f32 fpimm:$val))),
+ (DUP_ZR_S (MOVi32imm (bitcast_fpimm_to_i32 f32:$val)))>;
+ def : Pat<(nxv4f32 (AArch64dup (f32 fpimm:$val))),
+ (DUP_ZR_S (MOVi32imm (bitcast_fpimm_to_i32 f32:$val)))>;
+ def : Pat<(nxv2f64 (AArch64dup (f64 fpimm:$val))),
+ (DUP_ZR_D (MOVi64imm (bitcast_fpimm_to_i64 f64:$val)))>;
+
// Duplicate FP immediate into all vector elements
let AddedComplexity = 2 in {
def : Pat<(nxv8f16 (AArch64dup fpimm16:$imm8)),
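
[Editorial note on the hunk above: the new patterns splat an arbitrary FP
immediate by materialising its bit pattern in a general-purpose register
(MOVi32imm / MOVi64imm on the bitcast value) and broadcasting it with
DUP_ZR_S / DUP_ZR_D, so no FP register is involved. The pre-existing
immediate patterns below sit under AddedComplexity = 2 and should therefore
still win for constants that do fit the FP-immediate encoding, e.g. (a
hedged sketch, not taken from this commit's tests):]

    fmov z0.s, #0.5    // fmov-encodable immediate: splat directly, no GPR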
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-dup-x.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-dup-x.ll
index 9df1635e4153..71546e458a23 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-dup-x.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-dup-x.ll
@@ -130,12 +130,37 @@ define <vscale x 2 x double> @dup_imm_f64(double %b) {
ret <vscale x 2 x double> %out
}
+define <vscale x 2 x float> @dup_fmov_imm_f32_2() {
+; CHECK-LABEL: dup_fmov_imm_f32_2:
+; CHECK: mov w8, #1109917696
+; CHECK-NEXT: mov z0.s, w8
+ %out = tail call <vscale x 2 x float> @llvm.aarch64.sve.dup.x.nxv2f32(float 4.200000e+01)
+ ret <vscale x 2 x float> %out
+}
+
+define <vscale x 4 x float> @dup_fmov_imm_f32_4() {
+; CHECK-LABEL: dup_fmov_imm_f32_4:
+; CHECK: mov w8, #1109917696
+; CHECK-NEXT: mov z0.s, w8
+ %out = tail call <vscale x 4 x float> @llvm.aarch64.sve.dup.x.nxv4f32(float 4.200000e+01)
+ ret <vscale x 4 x float> %out
+}
+
+define <vscale x 2 x double> @dup_fmov_imm_f64_2() {
+; CHECK-LABEL: dup_fmov_imm_f64_2:
+; CHECK: mov x8, #4631107791820423168
+; CHECK-NEXT: mov z0.d, x8
+ %out = tail call <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double 4.200000e+01)
+ ret <vscale x 2 x double> %out
+}
+
declare <vscale x 16 x i8> @llvm.aarch64.sve.dup.x.nxv16i8( i8)
declare <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16)
declare <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32)
declare <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64)
declare <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half)
declare <vscale x 8 x bfloat> @llvm.aarch64.sve.dup.x.nxv8bf16(bfloat)
+declare <vscale x 2 x float> @llvm.aarch64.sve.dup.x.nxv2f32(float)
declare <vscale x 4 x float> @llvm.aarch64.sve.dup.x.nxv4f32(float)
declare <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double)
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-splat.ll b/llvm/test/CodeGen/AArch64/sve-vector-splat.ll
index be8ec87e7a56..d1ec1117cd35 100644
--- a/llvm/test/CodeGen/AArch64/sve-vector-splat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vector-splat.ll
@@ -372,5 +372,32 @@ define <vscale x 4 x float> @splat_nxv4f32_fold(<vscale x 4 x float> %x) {
ret <vscale x 4 x float> %r
}
+define <vscale x 2 x float> @splat_nxv2f32_fmov_fold() {
+; CHECK-LABEL: splat_nxv2f32_fmov_fold
+; CHECK: mov w8, #1109917696
+; CHECK-NEXT: mov z0.s, w8
+ %1 = insertelement <vscale x 2 x float> undef, float 4.200000e+01, i32 0
+ %2 = shufflevector <vscale x 2 x float> %1, <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer
+ ret <vscale x 2 x float> %2
+}
+
+define <vscale x 4 x float> @splat_nxv4f32_fmov_fold() {
+; CHECK-LABEL: splat_nxv4f32_fmov_fold
+; CHECK: mov w8, #1109917696
+; CHECK-NEXT: mov z0.s, w8
+ %1 = insertelement <vscale x 4 x float> undef, float 4.200000e+01, i32 0
+ %2 = shufflevector <vscale x 4 x float> %1, <vscale x 4 x float> undef, <vscale x 4 x i32> zeroinitializer
+ ret <vscale x 4 x float> %2
+}
+
+define <vscale x 2 x double> @splat_nxv2f64_fmov_fold() {
+; CHECK-LABEL: splat_nxv2f64_fmov_fold
+; CHECK: mov x8, #4631107791820423168
+; CHECK-NEXT: mov z0.d, x8
+ %1 = insertelement <vscale x 2 x double> undef, double 4.200000e+01, i32 0
+ %2 = shufflevector <vscale x 2 x double> %1, <vscale x 2 x double> undef, <vscale x 2 x i32> zeroinitializer
+ ret <vscale x 2 x double> %2
+}
+
; +bf16 is required for the bfloat version.
attributes #0 = { "target-features"="+sve,+bf16" }