[llvm] [ARM] Add tan intrinsic lowering (PR #95439)

Thu Jun 13 10:22:56 PDT 2024

https://github.com/farzonl created https://github.com/llvm/llvm-project/pull/95439

- `ARMISelLowering.cpp` - Add `tan` intrinsic lowering: promote the f16 type, and expand the NEON and MVE vector types

>From 56e03d17106bb009dc9a7d90a7c1f2be2ae68f68 Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <farzon at farzon.org>
Date: Thu, 13 Jun 2024 13:19:19 -0400
Subject: [PATCH] [ARM] Add tan intrinsic lowering - `ARMISelLowering.cpp` -
 Add f16 type and neon and mve vector support for tan

---
 llvm/lib/Target/ARM/ARMISelLowering.cpp       |   5 +
 .../ARM/2011-11-29-128bitArithmetics.ll       |  31 +++++
 llvm/test/CodeGen/ARM/fp16-fullfp16.ll        |  18 +++
 llvm/test/CodeGen/ARM/fp16-promote.ll         |  16 +++
 llvm/test/CodeGen/ARM/vfloatintrinsics.ll     |  21 ++++
 .../CodeGen/Thumb2/float-intrinsics-double.ll |   9 ++
 .../CodeGen/Thumb2/float-intrinsics-float.ll  |   9 ++
 llvm/test/CodeGen/Thumb2/mve-fmath.ll         | 111 ++++++++++++++++++
 8 files changed, 220 insertions(+)

diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index e3270471981cc..6faa02ec17aac 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -365,6 +365,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
       setOperationAction(ISD::FSQRT, VT, Expand);
       setOperationAction(ISD::FSIN, VT, Expand);
       setOperationAction(ISD::FCOS, VT, Expand);
+      setOperationAction(ISD::FTAN, VT, Expand);
       setOperationAction(ISD::FPOW, VT, Expand);
       setOperationAction(ISD::FLOG, VT, Expand);
       setOperationAction(ISD::FLOG2, VT, Expand);
@@ -875,6 +876,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
     setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
     setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
+    setOperationAction(ISD::FTAN, MVT::v2f64, Expand);
     setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
     setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
     setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
@@ -897,6 +899,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
     setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
     setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
+    setOperationAction(ISD::FTAN, MVT::v4f32, Expand);
     setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
     setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
     setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);
@@ -914,6 +917,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FSQRT, MVT::v2f32, Expand);
     setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
     setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
+    setOperationAction(ISD::FTAN, MVT::v2f32, Expand);
     setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
     setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
     setOperationAction(ISD::FLOG2, MVT::v2f32, Expand);
@@ -1540,6 +1544,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
     setOperationAction(ISD::FSIN, MVT::f16, Promote);
     setOperationAction(ISD::FCOS, MVT::f16, Promote);
+    setOperationAction(ISD::FTAN, MVT::f16, Promote);
     setOperationAction(ISD::FSINCOS, MVT::f16, Promote);
     setOperationAction(ISD::FPOWI, MVT::f16, Promote);
     setOperationAction(ISD::FPOW, MVT::f16, Promote);
diff --git a/llvm/test/CodeGen/ARM/2011-11-29-128bitArithmetics.ll b/llvm/test/CodeGen/ARM/2011-11-29-128bitArithmetics.ll
index e14e598086249..b6ebeaae5eb6d 100644
--- a/llvm/test/CodeGen/ARM/2011-11-29-128bitArithmetics.ll
+++ b/llvm/test/CodeGen/ARM/2011-11-29-128bitArithmetics.ll
@@ -56,6 +56,37 @@ L.entry:
 
 declare <4 x float> @llvm.cos.v4f32(<4 x float>) nounwind readonly
 
+define void @test_tan(ptr %X) nounwind {
+
+; CHECK-LABEL: test_tan:
+
+; CHECK:      movw  [[reg0:r[0-9]+]], :lower16:{{.*}}
+; CHECK:      movt  [[reg0]], :upper16:{{.*}}
+; CHECK:      vld1.64
+
+; CHECK:      {{v?mov(.32)?}}  r0,
+; CHECK:      bl  {{.*}}tanf
+
+; CHECK:      {{v?mov(.32)?}}  r0,
+; CHECK:      bl  {{.*}}tanf
+
+; CHECK:      {{v?mov(.32)?}}  r0,
+; CHECK:      bl  {{.*}}tanf
+
+; CHECK:      {{v?mov(.32)?}}  r0,
+; CHECK:      bl  {{.*}}tanf
+
+; CHECK:      vst1.64
+
+L.entry:
+  %0 = load <4 x float>, ptr @A, align 16
+  %1 = call <4 x float> @llvm.tan.v4f32(<4 x float> %0)
+  store <4 x float> %1, ptr %X, align 16
+  ret void
+}
+
+declare <4 x float> @llvm.tan.v4f32(<4 x float>) nounwind readonly
+
 define void @test_exp(ptr %X) nounwind {
 
 ; CHECK-LABEL: test_exp:
diff --git a/llvm/test/CodeGen/ARM/fp16-fullfp16.ll b/llvm/test/CodeGen/ARM/fp16-fullfp16.ll
index 7381d517505e8..2656cdbb0347e 100644
--- a/llvm/test/CodeGen/ARM/fp16-fullfp16.ll
+++ b/llvm/test/CodeGen/ARM/fp16-fullfp16.ll
@@ -281,6 +281,23 @@ define void @test_cos(ptr %p) {
   ret void
 }
 
+define void @test_tan(ptr %p) {
+; CHECK-LABEL: test_tan:
+; CHECK:         .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    vldr.16 s0, [r0]
+; CHECK-NEXT:    mov r4, r0
+; CHECK-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-NEXT:    bl tanf
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
+; CHECK-NEXT:    vstr.16 s0, [r4]
+; CHECK-NEXT:    pop {r4, pc}
+  %a = load half, ptr %p, align 2
+  %r = call half @llvm.tan.f16(half %a)
+  store half %r, ptr %p
+  ret void
+}
+
 define void @test_pow(ptr %p, ptr %q) {
 ; CHECK-LABEL: test_pow:
 ; CHECK:         .save {r4, lr}
@@ -588,6 +605,7 @@ declare half @llvm.sqrt.f16(half %a)
 declare half @llvm.powi.f16.i32(half %a, i32 %b)
 declare half @llvm.sin.f16(half %a)
 declare half @llvm.cos.f16(half %a)
+declare half @llvm.tan.f16(half %a)
 declare half @llvm.pow.f16(half %a, half %b)
 declare half @llvm.exp.f16(half %a)
 declare half @llvm.exp2.f16(half %a)
diff --git a/llvm/test/CodeGen/ARM/fp16-promote.ll b/llvm/test/CodeGen/ARM/fp16-promote.ll
index 9c01129ff30d8..ae3b8f9920e3b 100644
--- a/llvm/test/CodeGen/ARM/fp16-promote.ll
+++ b/llvm/test/CodeGen/ARM/fp16-promote.ll
@@ -393,6 +393,7 @@ declare half @llvm.sqrt.f16(half %a) #0
 declare half @llvm.powi.f16.i32(half %a, i32 %b) #0
 declare half @llvm.sin.f16(half %a) #0
 declare half @llvm.cos.f16(half %a) #0
+declare half @llvm.tan.f16(half %a) #0
 declare half @llvm.pow.f16(half %a, half %b) #0
 declare half @llvm.exp.f16(half %a) #0
 declare half @llvm.exp2.f16(half %a) #0
@@ -472,6 +473,21 @@ define void @test_cos(ptr %p) #0 {
   ret void
 }
 
+; CHECK-FP16-LABEL: test_tan:
+; CHECK-FP16: vcvtb.f32.f16
+; CHECK-FP16: bl tanf
+; CHECK-FP16: vcvtb.f16.f32
+; CHECK-LIBCALL-LABEL: test_tan:
+; CHECK-LIBCALL: bl __aeabi_h2f
+; CHECK-LIBCALL: bl tanf
+; CHECK-LIBCALL: bl __aeabi_f2h
+define void @test_tan(ptr %p) #0 {
+  %a = load half, ptr %p, align 2
+  %r = call half @llvm.tan.f16(half %a)
+  store half %r, ptr %p
+  ret void
+}
+
 ; CHECK-FP16-LABEL: test_pow:
 ; CHECK-FP16: vcvtb.f32.f16
 ; CHECK-FP16: vcvtb.f32.f16
diff --git a/llvm/test/CodeGen/ARM/vfloatintrinsics.ll b/llvm/test/CodeGen/ARM/vfloatintrinsics.ll
index 028bb76c3d435..74782d44c7423 100644
--- a/llvm/test/CodeGen/ARM/vfloatintrinsics.ll
+++ b/llvm/test/CodeGen/ARM/vfloatintrinsics.ll
@@ -29,6 +29,12 @@ define %v2f32 @test_v2f32.cos(%v2f32 %a) {
   %1 = call %v2f32 @llvm.cos.v2f32(%v2f32 %a)
   ret %v2f32 %1
 }
+; CHECK-LABEL: test_v2f32.tan:{{.*}}
+define %v2f32 @test_v2f32.tan(%v2f32 %a) {
+  ; CHECK: tan
+  %1 = call %v2f32 @llvm.tan.v2f32(%v2f32 %a)
+  ret %v2f32 %1
+}
 ; CHECK-LABEL: test_v2f32.pow:{{.*}}
 define %v2f32 @test_v2f32.pow(%v2f32 %a, %v2f32 %b) {
   ; CHECK: pow
@@ -112,6 +118,7 @@ declare %v2f32 @llvm.sqrt.v2f32(%v2f32) #0
 declare %v2f32 @llvm.powi.v2f32.i32(%v2f32, i32) #0
 declare %v2f32 @llvm.sin.v2f32(%v2f32) #0
 declare %v2f32 @llvm.cos.v2f32(%v2f32) #0
+declare %v2f32 @llvm.tan.v2f32(%v2f32) #0
 declare %v2f32 @llvm.pow.v2f32(%v2f32, %v2f32) #0
 declare %v2f32 @llvm.exp.v2f32(%v2f32) #0
 declare %v2f32 @llvm.exp2.v2f32(%v2f32) #0
@@ -153,6 +160,12 @@ define %v4f32 @test_v4f32.cos(%v4f32 %a) {
   %1 = call %v4f32 @llvm.cos.v4f32(%v4f32 %a)
   ret %v4f32 %1
 }
+; CHECK-LABEL: test_v4f32.tan:{{.*}}
+define %v4f32 @test_v4f32.tan(%v4f32 %a) {
+  ; CHECK: tan
+  %1 = call %v4f32 @llvm.tan.v4f32(%v4f32 %a)
+  ret %v4f32 %1
+}
 ; CHECK-LABEL: test_v4f32.pow:{{.*}}
 define %v4f32 @test_v4f32.pow(%v4f32 %a, %v4f32 %b) {
   ; CHECK: pow
@@ -236,6 +249,7 @@ declare %v4f32 @llvm.sqrt.v4f32(%v4f32) #0
 declare %v4f32 @llvm.powi.v4f32.i32(%v4f32, i32) #0
 declare %v4f32 @llvm.sin.v4f32(%v4f32) #0
 declare %v4f32 @llvm.cos.v4f32(%v4f32) #0
+declare %v4f32 @llvm.tan.v4f32(%v4f32) #0
 declare %v4f32 @llvm.pow.v4f32(%v4f32, %v4f32) #0
 declare %v4f32 @llvm.exp.v4f32(%v4f32) #0
 declare %v4f32 @llvm.exp2.v4f32(%v4f32) #0
@@ -277,6 +291,12 @@ define %v2f64 @test_v2f64.cos(%v2f64 %a) {
   %1 = call %v2f64 @llvm.cos.v2f64(%v2f64 %a)
   ret %v2f64 %1
 }
+; CHECK-LABEL: test_v2f64.tan:{{.*}}
+define %v2f64 @test_v2f64.tan(%v2f64 %a) {
+  ; CHECK: tan
+  %1 = call %v2f64 @llvm.tan.v2f64(%v2f64 %a)
+  ret %v2f64 %1
+}
 ; CHECK-LABEL: test_v2f64.pow:{{.*}}
 define %v2f64 @test_v2f64.pow(%v2f64 %a, %v2f64 %b) {
   ; CHECK: pow
@@ -361,6 +381,7 @@ declare %v2f64 @llvm.sqrt.v2f64(%v2f64) #0
 declare %v2f64 @llvm.powi.v2f64.i32(%v2f64, i32) #0
 declare %v2f64 @llvm.sin.v2f64(%v2f64) #0
 declare %v2f64 @llvm.cos.v2f64(%v2f64) #0
+declare %v2f64 @llvm.tan.v2f64(%v2f64) #0
 declare %v2f64 @llvm.pow.v2f64(%v2f64, %v2f64) #0
 declare %v2f64 @llvm.exp.v2f64(%v2f64) #0
 declare %v2f64 @llvm.exp2.v2f64(%v2f64) #0
diff --git a/llvm/test/CodeGen/Thumb2/float-intrinsics-double.ll b/llvm/test/CodeGen/Thumb2/float-intrinsics-double.ll
index 70a5939865b7b..7f5da36886939 100644
--- a/llvm/test/CodeGen/Thumb2/float-intrinsics-double.ll
+++ b/llvm/test/CodeGen/Thumb2/float-intrinsics-double.ll
@@ -41,6 +41,15 @@ define double @cos_d(double %a) {
   ret double %1
 }
 
+declare double     @llvm.tan.f64(double %Val)
+define double @tan_d(double %a) {
+; CHECK-LABEL: tan_d:
+; SOFT: {{(bl|b)}} tan
+; HARD: b tan
+  %1 = call double @llvm.tan.f64(double %a)
+  ret double %1
+}
+
 declare double     @llvm.pow.f64(double %Val, double %power)
 define double @pow_d(double %a, double %b) {
 ; CHECK-LABEL: pow_d:
diff --git a/llvm/test/CodeGen/Thumb2/float-intrinsics-float.ll b/llvm/test/CodeGen/Thumb2/float-intrinsics-float.ll
index b6b891edd0461..94ba9b218a072 100644
--- a/llvm/test/CodeGen/Thumb2/float-intrinsics-float.ll
+++ b/llvm/test/CodeGen/Thumb2/float-intrinsics-float.ll
@@ -42,6 +42,15 @@ define float @cos_f(float %a) {
   ret float %1
 }
 
+declare float     @llvm.tan.f32(float %Val)
+define float @tan_f(float %a) {
+; CHECK-LABEL: tan_f:
+; SOFT: bl tanf
+; HARD: b tanf
+  %1 = call float @llvm.tan.f32(float %a)
+  ret float %1
+}
+
 declare float     @llvm.pow.f32(float %Val, float %power)
 define float @pow_f(float %a, float %b) {
 ; CHECK-LABEL: pow_f:
diff --git a/llvm/test/CodeGen/Thumb2/mve-fmath.ll b/llvm/test/CodeGen/Thumb2/mve-fmath.ll
index c299b62a4c942..d747da76a45fa 100644
--- a/llvm/test/CodeGen/Thumb2/mve-fmath.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fmath.ll
@@ -288,6 +288,117 @@ entry:
   ret <2 x double> %0
 }
 
+define arm_aapcs_vfpcc <4 x float> @tan_float32_t(<4 x float> %src) {
+; CHECK-LABEL: tan_float32_t:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vmov q4, q0
+; CHECK-NEXT:    vmov r0, r4, d9
+; CHECK-NEXT:    bl tanf
+; CHECK-NEXT:    mov r5, r0
+; CHECK-NEXT:    mov r0, r4
+; CHECK-NEXT:    bl tanf
+; CHECK-NEXT:    vmov r4, r1, d8
+; CHECK-NEXT:    vmov s19, r0
+; CHECK-NEXT:    vmov s18, r5
+; CHECK-NEXT:    mov r0, r1
+; CHECK-NEXT:    bl tanf
+; CHECK-NEXT:    vmov s17, r0
+; CHECK-NEXT:    mov r0, r4
+; CHECK-NEXT:    bl tanf
+; CHECK-NEXT:    vmov s16, r0
+; CHECK-NEXT:    vmov q0, q4
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
+entry:
+  %0 = call fast <4 x float> @llvm.tan.v4f32(<4 x float> %src)
+  ret <4 x float> %0
+}
+
+define arm_aapcs_vfpcc <8 x half> @tan_float16_t(<8 x half> %src) {
+; CHECK-LABEL: tan_float16_t:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    vmov q4, q0
+; CHECK-NEXT:    vcvtb.f32.f16 s0, s16
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    bl tanf
+; CHECK-NEXT:    vcvtt.f32.f16 s0, s16
+; CHECK-NEXT:    vmov s16, r0
+; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    mov r0, r1
+; CHECK-NEXT:    bl tanf
+; CHECK-NEXT:    vmov s0, r0
+; CHECK-NEXT:    vcvtb.f16.f32 s20, s16
+; CHECK-NEXT:    vcvtt.f16.f32 s20, s0
+; CHECK-NEXT:    vcvtb.f32.f16 s0, s17
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    bl tanf
+; CHECK-NEXT:    vmov s0, r0
+; CHECK-NEXT:    vcvtb.f16.f32 s21, s0
+; CHECK-NEXT:    vcvtt.f32.f16 s0, s17
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    bl tanf
+; CHECK-NEXT:    vmov s0, r0
+; CHECK-NEXT:    vcvtt.f16.f32 s21, s0
+; CHECK-NEXT:    vcvtb.f32.f16 s0, s18
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    bl tanf
+; CHECK-NEXT:    vmov s0, r0
+; CHECK-NEXT:    vcvtb.f16.f32 s22, s0
+; CHECK-NEXT:    vcvtt.f32.f16 s0, s18
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    bl tanf
+; CHECK-NEXT:    vmov s0, r0
+; CHECK-NEXT:    vcvtt.f16.f32 s22, s0
+; CHECK-NEXT:    vcvtb.f32.f16 s0, s19
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    bl tanf
+; CHECK-NEXT:    vmov s0, r0
+; CHECK-NEXT:    vcvtb.f16.f32 s23, s0
+; CHECK-NEXT:    vcvtt.f32.f16 s0, s19
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    bl tanf
+; CHECK-NEXT:    vmov s0, r0
+; CHECK-NEXT:    vcvtt.f16.f32 s23, s0
+; CHECK-NEXT:    vmov q0, q5
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    pop {r7, pc}
+entry:
+  %0 = call fast <8 x half> @llvm.tan.v8f16(<8 x half> %src)
+  ret <8 x half> %0
+}
+
+define arm_aapcs_vfpcc <2 x double> @tan_float64_t(<2 x double> %src) {
+; CHECK-LABEL: tan_float64_t:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vmov q4, q0
+; CHECK-NEXT:    vmov r0, r1, d9
+; CHECK-NEXT:    bl tan
+; CHECK-NEXT:    vmov r2, r3, d8
+; CHECK-NEXT:    vmov d9, r0, r1
+; CHECK-NEXT:    mov r0, r2
+; CHECK-NEXT:    mov r1, r3
+; CHECK-NEXT:    bl tan
+; CHECK-NEXT:    vmov d8, r0, r1
+; CHECK-NEXT:    vmov q0, q4
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    pop {r7, pc}
+entry:
+  %0 = call fast <2 x double> @llvm.tan.v2f64(<2 x double> %src)
+  ret <2 x double> %0
+}
+
 define arm_aapcs_vfpcc <4 x float> @exp_float32_t(<4 x float> %src) {
 ; CHECK-LABEL: exp_float32_t:
 ; CHECK:       @ %bb.0: @ %entry