[llvm] [SLP] Use the correct calling convention for vector math routines (PR #180759)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 10 07:51:38 PST 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
Author: David Sherwood (david-arm)
<details>
<summary>Changes</summary>
When vectorising calls to math intrinsics such as llvm.pow, we
correctly detect and generate calls to the corresponding vector
math variant. However, we don't pick up and use the calling
convention of the vector math function. This matters for veclibs
such as ArmPL, where the aarch64_vector_pcs calling convention
can improve codegen by reducing the number of registers that
need saving across calls.
---
Patch is 28.31 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/180759.diff
2 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+1)
- (modified) llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll (+249-4)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index f89c22fafcf04..10b0dfb74f9fc 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -21069,6 +21069,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
propagateIRFlags(V, E->Scalars, VL0);
+ cast<CallInst>(V)->setCallingConv(CF->getCallingConv());
V = FinalShuffle(V, E);
E->VectorizedValue = V;
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll
index 34d65a307016f..3994fcecd286c 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=inject-tli-mappings,slp-vectorizer -vector-library=Accelerate -S %s | FileCheck %s
+; RUN: opt -passes=inject-tli-mappings,slp-vectorizer -vector-library=ArmPL -S %s | FileCheck %s --check-prefix=CHECK-ARMPL
; RUN: opt -passes=inject-tli-mappings,slp-vectorizer -S %s | FileCheck --check-prefix NOACCELERATE %s
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
@@ -15,6 +16,12 @@ define <4 x float> @int_sin_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vsinf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @int_sin_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast aarch64_vector_pcs <4 x float> @armpl_vsinq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @int_sin_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -58,6 +65,12 @@ define <4 x float> @ceil_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @ceil_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @ceil_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -90,6 +103,12 @@ define <4 x float> @fabs_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @fabs_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @fabs_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -120,6 +139,12 @@ define <4 x float> @int_fabs_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @int_fabs_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @int_fabs_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -150,6 +175,12 @@ define <4 x float> @floor_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @floor_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @floor_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -180,6 +211,12 @@ define <4 x float> @sqrt_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @sqrt_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @sqrt_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -210,6 +247,12 @@ define <4 x float> @exp_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vexpf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @exp_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast aarch64_vector_pcs <4 x float> @armpl_vexpq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @exp_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -251,6 +294,12 @@ define <4 x float> @expm1_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vexpm1f(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @expm1_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast aarch64_vector_pcs <4 x float> @armpl_vexpm1q_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @expm1_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -292,6 +341,12 @@ define <4 x float> @log_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vlogf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @log_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast aarch64_vector_pcs <4 x float> @armpl_vlogq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @log_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -333,6 +388,12 @@ define <4 x float> @log1p_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vlog1pf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @log1p_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast aarch64_vector_pcs <4 x float> @armpl_vlog1pq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @log1p_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -385,6 +446,23 @@ define <4 x float> @log10p_4x(ptr %a) {
; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
;
+; CHECK-ARMPL-LABEL: @log10p_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = tail call fast float @log10pf(float [[VECEXT]])
+; CHECK-ARMPL-NEXT: [[VECINS:%.*]] = insertelement <4 x float> zeroinitializer, float [[TMP1]], i32 0
+; CHECK-ARMPL-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
+; CHECK-ARMPL-NEXT: [[TMP2:%.*]] = tail call fast float @log10pf(float [[VECEXT_1]])
+; CHECK-ARMPL-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
+; CHECK-ARMPL-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; CHECK-ARMPL-NEXT: [[TMP3:%.*]] = tail call fast float @log10pf(float [[VECEXT_2]])
+; CHECK-ARMPL-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; CHECK-ARMPL-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; CHECK-ARMPL-NEXT: [[TMP4:%.*]] = tail call fast float @log10pf(float [[VECEXT_3]])
+; CHECK-ARMPL-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; CHECK-ARMPL-NEXT: ret <4 x float> [[VECINS_3]]
+;
; NOACCELERATE-LABEL: @log10p_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -426,6 +504,23 @@ define <4 x float> @logb_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vlogbf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @logb_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = tail call fast float @logbf(float [[VECEXT]])
+; CHECK-ARMPL-NEXT: [[VECINS:%.*]] = insertelement <4 x float> zeroinitializer, float [[TMP1]], i32 0
+; CHECK-ARMPL-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
+; CHECK-ARMPL-NEXT: [[TMP2:%.*]] = tail call fast float @logbf(float [[VECEXT_1]])
+; CHECK-ARMPL-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
+; CHECK-ARMPL-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; CHECK-ARMPL-NEXT: [[TMP3:%.*]] = tail call fast float @logbf(float [[VECEXT_2]])
+; CHECK-ARMPL-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; CHECK-ARMPL-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; CHECK-ARMPL-NEXT: [[TMP4:%.*]] = tail call fast float @logbf(float [[VECEXT_3]])
+; CHECK-ARMPL-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; CHECK-ARMPL-NEXT: ret <4 x float> [[VECINS_3]]
+;
; NOACCELERATE-LABEL: @logb_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -467,6 +562,12 @@ define <4 x float> @sin_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vsinf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @sin_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast aarch64_vector_pcs <4 x float> @armpl_vsinq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @sin_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -508,6 +609,12 @@ define <4 x float> @cos_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vcosf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @cos_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast aarch64_vector_pcs <4 x float> @armpl_vcosq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @cos_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -549,6 +656,12 @@ define <4 x float> @tan_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vtanf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @tan_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast aarch64_vector_pcs <4 x float> @armpl_vtanq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @tan_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -590,6 +703,12 @@ define <4 x float> @asin_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vasinf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @asin_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast aarch64_vector_pcs <4 x float> @armpl_vasinq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @asin_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -630,6 +749,12 @@ define <4 x float> @int_asin_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vasinf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @int_asin_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast aarch64_vector_pcs <4 x float> @armpl_vasinq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @int_asin_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -671,6 +796,12 @@ define <4 x float> @acos_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vacosf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @acos_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast aarch64_vector_pcs <4 x float> @armpl_vacosq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @acos_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -711,6 +842,12 @@ define <4 x float> @int_acos_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vacosf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @int_acos_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast aarch64_vector_pcs <4 x float> @armpl_vacosq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @int_acos_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -752,6 +889,12 @@ define <4 x float> @atan_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vatanf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @atan_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast aarch64_vector_pcs <4 x float> @armpl_vatanq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @atan_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -792,6 +935,12 @@ define <4 x float> @int_atan_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vatanf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @int_atan_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast aarch64_vector_pcs <4 x float> @armpl_vatanq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @int_atan_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -834,6 +983,13 @@ define <4 x float> @atan2_4x(ptr %a, ptr %b) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vatan2f(<4 x float> [[TMP0]], <4 x float> [[BB]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @atan2_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[BB:%.*]] = load <4 x float>, ptr [[B:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast aarch64_vector_pcs <4 x float> @armpl_vatan2q_f32(<4 x float> [[TMP0]], <4 x float> [[BB]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @atan2_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -885,6 +1041,13 @@ define <4 x float> @int_atan2_4x(ptr %a, ptr %b) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vatan2f(<4 x float> [[TMP0]], <4 x float> [[BB]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @int_atan2_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[BB:%.*]] = load <4 x float>, ptr [[B:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast aarch64_vector_pcs <4 x float> @armpl_vatan2q_f32(<4 x float> [[TMP0]], <4 x float> [[BB]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @int_atan2_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -936,6 +1099,12 @@ define <4 x float> @sinh_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vsinhf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @sinh_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast aarch64_vector_pcs <4 x float> @armpl_vsinhq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @sinh_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -976,6 +1145,12 @@ define <4 x float> @int_sinh_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vsinhf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @int_sinh_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast aarch64_vector_pcs <4 x float> @armpl_vsinhq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @int_sinh_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -1017,6 +1192,12 @@ define <4 x float> @cosh_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vcoshf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @cosh_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast aarch64_vector_pcs <4 x float> @armpl_vcoshq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @cosh_4x(
; NOACCELER...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/180759
More information about the llvm-commits
mailing list