[llvm-dev] ARM vectorized fp16 support

Yizhi Liu via llvm-dev llvm-dev at lists.llvm.org
Wed Sep 4 22:52:46 PDT 2019


Hi,

I'm trying to compile a half-precision program for ARM, but it seems
LLVM fails to automatically generate a fused multiply-add instruction
for c += a * b. I'm wondering whether I did something wrong; if not,
is this a missing feature that will be supported later? (I know there
are fp16 FMLA intrinsics, though.)

Test programs and outputs:

$ clang -O3 -march=armv8.2-a+fp16fml -ffast-math -S -o- vfp32.c
test_vfma_lane_f16:                     // @test_vfma_lane_f16
                fmla       v2.4s, v1.4s, v0.4s   // fp32 is GOOD
                mov       v0.16b, v2.16b
                ret
$ cat vfp32.c
#include <arm_neon.h>
float32x4_t test_vfma_lane_f16(float32x4_t a, float32x4_t b, float32x4_t c) {
  c += a * b;
  return c;
}

$ clang -O3 -march=armv8.2-a+fp16fml -ffast-math -S -o- vfp16.c
test_vfma_lane_f16:                     // @test_vfma_lane_f16
                fmul       v0.4h, v1.4h, v0.4h
                fadd       v0.4h, v0.4h, v2.4h  // fp16 does NOT use FMLA
                ret
$ cat vfp16.c
#include <arm_neon.h>
float16x4_t test_vfma_lane_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
  c += a * b;
  return c;
}
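For comparison, the explicit NEON intrinsic mentioned above does produce a
single FMLA. A minimal sketch, assuming an ARMv8.2-A target with the fp16
extensions enabled (e.g. -march=armv8.2-a+fp16fml); the function name here is
just illustrative:

```c
#include <arm_neon.h>

/* vfma_f16(c, a, b) computes c + a * b on 4 half-precision lanes and
   should lower to a single "fmla v.4h" instruction on targets with
   ARMv8.2-A half-precision vector arithmetic. */
float16x4_t test_vfma_f16_intrinsic(float16x4_t a, float16x4_t b,
                                    float16x4_t c) {
  return vfma_f16(c, a, b);
}
```

This is the manual workaround; the question above is about getting the same
instruction selection automatically from the plain c += a * b form.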

-- 
Yizhi Liu