[llvm] r339232 - [ARM] FP16: vector VMUL variants

Wed Aug 8 03:27:34 PDT 2018

Author: sjoerdmeijer
Date: Wed Aug  8 03:27:34 2018
New Revision: 339232

URL: http://llvm.org/viewvc/llvm-project?rev=339232&view=rev
Log:
[ARM] FP16: vector VMUL variants

This adds codegen support for the vmul_lane_f16 and vmul_n_f16 variants.

Differential Revision: https://reviews.llvm.org/D50326

Modified:
    llvm/trunk/lib/Target/ARM/ARMInstrNEON.td
    llvm/trunk/test/CodeGen/ARM/armv8.2a-fp16-vector-intrinsics.ll

Modified: llvm/trunk/lib/Target/ARM/ARMInstrNEON.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMInstrNEON.td?rev=339232&r1=339231&r2=339232&view=diff
==============================================================================

--- llvm/trunk/lib/Target/ARM/ARMInstrNEON.td (original)
+++ llvm/trunk/lib/Target/ARM/ARMInstrNEON.td Wed Aug  8 03:27:34 2018
@@ -4305,17 +4305,29 @@ def : Pat<(v4f32 (fmul (v4f32 QPR:$src1)
                            (v2f32 (EXTRACT_SUBREG QPR:$src2,
                                    (DSubReg_i32_reg imm:$lane))),
                            (SubReg_i32_lane imm:$lane)))>;
-
+def : Pat<(v8f16 (fmul (v8f16 QPR:$src1),
+                       (v8f16 (NEONvduplane (v8f16 QPR:$src2), imm:$lane)))),
+          (v8f16 (VMULslhq(v8f16 QPR:$src1),
+                           (v4f16 (EXTRACT_SUBREG QPR:$src2,
+                                   (DSubReg_i16_reg imm:$lane))),
+                           (SubReg_i16_lane imm:$lane)))>;
 
 def : Pat<(v2f32 (fmul DPR:$Rn, (NEONvdup (f32 SPR:$Rm)))),
           (VMULslfd DPR:$Rn,
             (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$Rm, ssub_0),
             (i32 0))>;
+def : Pat<(v4f16 (fmul DPR:$Rn, (NEONvdup (f16 HPR:$Rm)))),
+          (VMULslhd DPR:$Rn,
+            (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), HPR:$Rm, ssub_0),
+            (i32 0))>;
 def : Pat<(v4f32 (fmul QPR:$Rn, (NEONvdup (f32 SPR:$Rm)))),
           (VMULslfq QPR:$Rn,
             (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$Rm, ssub_0),
             (i32 0))>;
-
+def : Pat<(v8f16 (fmul QPR:$Rn, (NEONvdup (f16 HPR:$Rm)))),
+          (VMULslhq QPR:$Rn,
+            (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), HPR:$Rm, ssub_0),
+            (i32 0))>;
 
 //   VQDMULH  : Vector Saturating Doubling Multiply Returning High Half
 defm VQDMULH  : N3VInt_HS<0, 0, 0b1011, 0, N3RegFrm, IIC_VMULi16D, IIC_VMULi32D,

Modified: llvm/trunk/test/CodeGen/ARM/armv8.2a-fp16-vector-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/armv8.2a-fp16-vector-intrinsics.ll?rev=339232&r1=339231&r2=339232&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/armv8.2a-fp16-vector-intrinsics.ll (original)
+++ llvm/trunk/test/CodeGen/ARM/armv8.2a-fp16-vector-intrinsics.ll Wed Aug  8 03:27:34 2018
@@ -979,43 +979,53 @@ entry:
   ret <8 x half> %0
 }
 
-; FIXME (PR38404)
-;
-;define dso_local <4 x half> @test_vmul_lane_f16(<4 x half> %a, <4 x half> %b) {
-;entry:
-;  %shuffle = shufflevector <4 x half> %b, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-;  %mul = fmul <4 x half> %shuffle, %a
-;  ret <4 x half> %mul
-;}
+define dso_local <4 x half> @test_vmul_lane_f16(<4 x half> %a, <4 x half> %b) {
+; CHECK-LABEL: test_vmul_lane_f16:
+; CHECK:         vmul.f16 d0, d0, d1[3]
+; CHECK-NEXT:    bx lr
+entry:
+  %shuffle = shufflevector <4 x half> %b, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %mul = fmul <4 x half> %shuffle, %a
+  ret <4 x half> %mul
+}
 
-;define dso_local <8 x half> @test_vmulq_lane_f16(<8 x half> %a, <4 x half> %b) {
-;entry:
-;  %shuffle = shufflevector <4 x half> %b, <4 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-;  %mul = fmul <8 x half> %shuffle, %a
-;  ret <8 x half> %mul
-;}
+define dso_local <8 x half> @test_vmulq_lane_f16(<8 x half> %a, <4 x half> %b) {
+; CHECK-LABEL: test_vmulq_lane_f16:
+; CHECK:         vmul.f16 q0, q0, d2[3]
+; CHECK-NEXT:    bx lr
+entry:
+  %shuffle = shufflevector <4 x half> %b, <4 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  %mul = fmul <8 x half> %shuffle, %a
+  ret <8 x half> %mul
+}
 
-;define dso_local <4 x half> @test_vmul_n_f16(<4 x half> %a, float %b.coerce) {
-;entry:
-;  %0 = bitcast float %b.coerce to i32
-;  %tmp.0.extract.trunc = trunc i32 %0 to i16
-;  %1 = bitcast i16 %tmp.0.extract.trunc to half
-;  %vecinit = insertelement <4 x half> undef, half %1, i32 0
-;  %vecinit4 = shufflevector <4 x half> %vecinit, <4 x half> undef, <4 x i32> zeroinitializer
-;  %mul = fmul <4 x half> %vecinit4, %a
-;  ret <4 x half> %mul
-;}
+define dso_local <4 x half> @test_vmul_n_f16(<4 x half> %a, float %b.coerce) {
+; CHECK-LABEL: test_vmul_n_f16:
+; CHECK:         vmul.f16 d0, d0, d1[0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast float %b.coerce to i32
+  %tmp.0.extract.trunc = trunc i32 %0 to i16
+  %1 = bitcast i16 %tmp.0.extract.trunc to half
+  %vecinit = insertelement <4 x half> undef, half %1, i32 0
+  %vecinit4 = shufflevector <4 x half> %vecinit, <4 x half> undef, <4 x i32> zeroinitializer
+  %mul = fmul <4 x half> %vecinit4, %a
+  ret <4 x half> %mul
+}
 
-;define dso_local <8 x half> @test_vmulq_n_f16(<8 x half> %a, float %b.coerce) {
-;entry:
-;  %0 = bitcast float %b.coerce to i32
-;  %tmp.0.extract.trunc = trunc i32 %0 to i16
-;  %1 = bitcast i16 %tmp.0.extract.trunc to half
-;  %vecinit = insertelement <8 x half> undef, half %1, i32 0
-;  %vecinit8 = shufflevector <8 x half> %vecinit, <8 x half> undef, <8 x i32> zeroinitializer
-;  %mul = fmul <8 x half> %vecinit8, %a
-;  ret <8 x half> %mul
-;}
+define dso_local <8 x half> @test_vmulq_n_f16(<8 x half> %a, float %b.coerce) {
+; CHECK-LABEL: test_vmulq_n_f16:
+; CHECK:         vmul.f16 q0, q0, d2[0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast float %b.coerce to i32
+  %tmp.0.extract.trunc = trunc i32 %0 to i16
+  %1 = bitcast i16 %tmp.0.extract.trunc to half
+  %vecinit = insertelement <8 x half> undef, half %1, i32 0
+  %vecinit8 = shufflevector <8 x half> %vecinit, <8 x half> undef, <8 x i32> zeroinitializer
+  %mul = fmul <8 x half> %vecinit8, %a
+  ret <8 x half> %mul
+}
 
 define dso_local <4 x half> @test_vbsl_f16(<4 x i16> %a, <4 x half> %b, <4 x half> %c) {
 ; CHECKLABEL: test_vbsl_f16: