[PATCH] D96700: [llvm][Aarch64][SVE] Remove extra fmov instruction with certain literals

Mon Feb 15 04:09:35 PST 2021

DavidTruby created this revision.
Herald added subscribers: psnobl, hiraditya, kristof.beyls, tschuett.
Herald added a reviewer: efriedma.
DavidTruby requested review of this revision.
Herald added a project: LLVM.
Herald added a subscriber: llvm-commits.

When an SVE vector is initialised with a floating-point literal that cannot be
encoded in the immediate form of the fmov instruction, a redundant fmov is
currently generated. This patch adds codegen patterns that move the literal's
bit pattern into a general-purpose register and duplicate it from there, so
the redundant fmov is no longer emitted.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D96700

Files:
  llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
  llvm/test/CodeGen/AArch64/sve-intrinsics-dup-x.ll
  llvm/test/CodeGen/AArch64/sve-vector-splat.ll


Index: llvm/test/CodeGen/AArch64/sve-vector-splat.ll
===================================================================

--- llvm/test/CodeGen/AArch64/sve-vector-splat.ll
+++ llvm/test/CodeGen/AArch64/sve-vector-splat.ll
@@ -372,5 +372,24 @@
   ret <vscale x 4 x float> %r
 }
 
+define <vscale x 2 x float> @splat_nxv2f32_fmov_fold() {
+; CHECK-LABEL: splat_nxv2f32_fmov_fold
+; CHECK: mov w8, #1109917696
+; CHECK-NEXT: mov z0.s, w8
+  %1 = insertelement <vscale x 2 x float> undef, float 4.200000e+01, i32 0
+  %2 = shufflevector <vscale x 2 x float> %1, <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x float> %2
+}
+
+define <vscale x 4 x float> @splat_nxv4f32_fmov_fold() {
+; CHECK-LABEL: splat_nxv4f32_fmov_fold
+; CHECK: mov w8, #1109917696
+; CHECK-NEXT: mov z0.s, w8
+  %1 = insertelement <vscale x 4 x float> undef, float 4.200000e+01, i32 0
+  %2 = shufflevector <vscale x 4 x float> %1, <vscale x 4 x float> undef, <vscale x 4 x i32> zeroinitializer
+  ret <vscale x 4 x float> %2
+}
+
+
 ; +bf16 is required for the bfloat version.
 attributes #0 = { "target-features"="+sve,+bf16" }
Index: llvm/test/CodeGen/AArch64/sve-intrinsics-dup-x.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-intrinsics-dup-x.ll
+++ llvm/test/CodeGen/AArch64/sve-intrinsics-dup-x.ll
@@ -130,12 +130,30 @@
   ret <vscale x 2 x double> %out
 }
 
+define <vscale x 2 x float> @dup_fmov_imm_f32_2() {
+; CHECK-LABEL: dup_fmov_imm_f32_2:
+; CHECK: mov w8, #1109917696
+; CHECK-NEXT: mov z0.s, w8
+  %out = tail call <vscale x 2 x float> @llvm.aarch64.sve.dup.x.nxv2f32(float 4.200000e+01)
+  ret <vscale x 2 x float> %out
+}
+
+
+define <vscale x 4 x float> @dup_fmov_imm_f32_4() {
+; CHECK-LABEL: dup_fmov_imm_f32_4:
+; CHECK: mov w8, #1109917696
+; CHECK-NEXT: mov z0.s, w8
+  %out = tail call <vscale x 4 x float> @llvm.aarch64.sve.dup.x.nxv4f32(float 4.200000e+01)
+  ret <vscale x 4 x float> %out
+}
+
 declare <vscale x 16 x i8> @llvm.aarch64.sve.dup.x.nxv16i8( i8)
 declare <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16)
 declare <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32)
 declare <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64)
 declare <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half)
 declare <vscale x 8 x bfloat> @llvm.aarch64.sve.dup.x.nxv8bf16(bfloat)
+declare <vscale x 2 x float> @llvm.aarch64.sve.dup.x.nxv2f32(float)
 declare <vscale x 4 x float> @llvm.aarch64.sve.dup.x.nxv4f32(float)
 declare <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double)
 
Index: llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -556,6 +556,12 @@
   def : Pat<(nxv2i64 (AArch64dup (i64 (SVE8BitLslImm i32:$a, i32:$b)))),
             (DUP_ZI_D $a, $b)>;
 
+  // Duplicate an arbitrary f32 immediate into all vector elements via a GPR.
+  def : Pat<(nxv2f32 (AArch64dup (f32 fpimm:$val))),
+            (DUP_ZR_S (MOVi32imm (bitcast_fpimm_to_i32 f32:$val)))>;
+  def : Pat<(nxv4f32 (AArch64dup (f32 fpimm:$val))),
+            (DUP_ZR_S (MOVi32imm (bitcast_fpimm_to_i32 f32:$val)))>;
+
   // Duplicate FP immediate into all vector elements
   let AddedComplexity = 2 in {
     def : Pat<(nxv8f16 (AArch64dup fpimm16:$imm8)),


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D96700.323711.patch
Type: text/x-patch
Size: 3393 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20210215/a8efabda/attachment.bin>