[llvm] [SLP] Use the correct calling convention for vector math routines (PR #180759)
David Sherwood via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 10 07:50:56 PST 2026
https://github.com/david-arm created https://github.com/llvm/llvm-project/pull/180759
When vectorising calls to math intrinsics such as llvm.pow we
correctly detect and generate calls to the corresponding vector
math variant. However, we don't pick up and use the calling
convention for the vector math function. This matters for veclibs
such as ArmPL where the aarch64_vector_pcs calling convention
can improve codegen by reducing the number of registers that
need saving across calls.
From 7a365f882aa6728b9935939023b3af3e7c20b924 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Tue, 10 Feb 2026 15:48:20 +0000
Subject: [PATCH 1/2] Add tests
---
.../AArch64/accelerate-vector-functions.ll | 253 +++++++++++++++++-
1 file changed, 249 insertions(+), 4 deletions(-)
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll
index 34d65a307016f..e31e7fb9ff81e 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=inject-tli-mappings,slp-vectorizer -vector-library=Accelerate -S %s | FileCheck %s
+; RUN: opt -passes=inject-tli-mappings,slp-vectorizer -vector-library=ArmPL -S %s | FileCheck %s --check-prefix=CHECK-ARMPL
; RUN: opt -passes=inject-tli-mappings,slp-vectorizer -S %s | FileCheck --check-prefix NOACCELERATE %s
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
@@ -15,6 +16,12 @@ define <4 x float> @int_sin_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vsinf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @int_sin_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vsinq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @int_sin_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -58,6 +65,12 @@ define <4 x float> @ceil_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @ceil_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @ceil_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -90,6 +103,12 @@ define <4 x float> @fabs_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @fabs_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @fabs_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -120,6 +139,12 @@ define <4 x float> @int_fabs_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @int_fabs_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @int_fabs_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -150,6 +175,12 @@ define <4 x float> @floor_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @floor_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @floor_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -180,6 +211,12 @@ define <4 x float> @sqrt_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @sqrt_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @sqrt_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -210,6 +247,12 @@ define <4 x float> @exp_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vexpf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @exp_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vexpq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @exp_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -251,6 +294,12 @@ define <4 x float> @expm1_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vexpm1f(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @expm1_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vexpm1q_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @expm1_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -292,6 +341,12 @@ define <4 x float> @log_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vlogf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @log_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vlogq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @log_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -333,6 +388,12 @@ define <4 x float> @log1p_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vlog1pf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @log1p_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vlog1pq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @log1p_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -385,6 +446,23 @@ define <4 x float> @log10p_4x(ptr %a) {
; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
;
+; CHECK-ARMPL-LABEL: @log10p_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = tail call fast float @log10pf(float [[VECEXT]])
+; CHECK-ARMPL-NEXT: [[VECINS:%.*]] = insertelement <4 x float> zeroinitializer, float [[TMP1]], i32 0
+; CHECK-ARMPL-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
+; CHECK-ARMPL-NEXT: [[TMP2:%.*]] = tail call fast float @log10pf(float [[VECEXT_1]])
+; CHECK-ARMPL-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
+; CHECK-ARMPL-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; CHECK-ARMPL-NEXT: [[TMP3:%.*]] = tail call fast float @log10pf(float [[VECEXT_2]])
+; CHECK-ARMPL-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; CHECK-ARMPL-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; CHECK-ARMPL-NEXT: [[TMP4:%.*]] = tail call fast float @log10pf(float [[VECEXT_3]])
+; CHECK-ARMPL-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; CHECK-ARMPL-NEXT: ret <4 x float> [[VECINS_3]]
+;
; NOACCELERATE-LABEL: @log10p_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -426,6 +504,23 @@ define <4 x float> @logb_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vlogbf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @logb_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = tail call fast float @logbf(float [[VECEXT]])
+; CHECK-ARMPL-NEXT: [[VECINS:%.*]] = insertelement <4 x float> zeroinitializer, float [[TMP1]], i32 0
+; CHECK-ARMPL-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
+; CHECK-ARMPL-NEXT: [[TMP2:%.*]] = tail call fast float @logbf(float [[VECEXT_1]])
+; CHECK-ARMPL-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
+; CHECK-ARMPL-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; CHECK-ARMPL-NEXT: [[TMP3:%.*]] = tail call fast float @logbf(float [[VECEXT_2]])
+; CHECK-ARMPL-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; CHECK-ARMPL-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; CHECK-ARMPL-NEXT: [[TMP4:%.*]] = tail call fast float @logbf(float [[VECEXT_3]])
+; CHECK-ARMPL-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; CHECK-ARMPL-NEXT: ret <4 x float> [[VECINS_3]]
+;
; NOACCELERATE-LABEL: @logb_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -467,6 +562,12 @@ define <4 x float> @sin_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vsinf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @sin_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vsinq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @sin_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -508,6 +609,12 @@ define <4 x float> @cos_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vcosf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @cos_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vcosq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @cos_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -549,6 +656,12 @@ define <4 x float> @tan_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vtanf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @tan_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vtanq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @tan_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -590,6 +703,12 @@ define <4 x float> @asin_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vasinf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @asin_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vasinq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @asin_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -630,6 +749,12 @@ define <4 x float> @int_asin_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vasinf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @int_asin_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vasinq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @int_asin_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -671,6 +796,12 @@ define <4 x float> @acos_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vacosf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @acos_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vacosq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @acos_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -711,6 +842,12 @@ define <4 x float> @int_acos_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vacosf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @int_acos_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vacosq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @int_acos_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -752,6 +889,12 @@ define <4 x float> @atan_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vatanf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @atan_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vatanq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @atan_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -792,6 +935,12 @@ define <4 x float> @int_atan_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vatanf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @int_atan_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vatanq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @int_atan_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -834,6 +983,13 @@ define <4 x float> @atan2_4x(ptr %a, ptr %b) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vatan2f(<4 x float> [[TMP0]], <4 x float> [[BB]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @atan2_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[BB:%.*]] = load <4 x float>, ptr [[B:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vatan2q_f32(<4 x float> [[TMP0]], <4 x float> [[BB]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @atan2_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -885,6 +1041,13 @@ define <4 x float> @int_atan2_4x(ptr %a, ptr %b) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vatan2f(<4 x float> [[TMP0]], <4 x float> [[BB]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @int_atan2_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[BB:%.*]] = load <4 x float>, ptr [[B:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vatan2q_f32(<4 x float> [[TMP0]], <4 x float> [[BB]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @int_atan2_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -936,6 +1099,12 @@ define <4 x float> @sinh_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vsinhf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @sinh_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vsinhq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @sinh_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -976,6 +1145,12 @@ define <4 x float> @int_sinh_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vsinhf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @int_sinh_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vsinhq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @int_sinh_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -1017,6 +1192,12 @@ define <4 x float> @cosh_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vcoshf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @cosh_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vcoshq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @cosh_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -1057,6 +1238,12 @@ define <4 x float> @int_cosh_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vcoshf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @int_cosh_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vcoshq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @int_cosh_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -1098,6 +1285,12 @@ define <4 x float> @tanh_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vtanhf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @tanh_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vtanhq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @tanh_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -1138,6 +1331,12 @@ define <4 x float> @int_tanh_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vtanhf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @int_tanh_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vtanhq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @int_tanh_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -1179,6 +1378,12 @@ define <4 x float> @asinh_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vasinhf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @asinh_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vasinhq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @asinh_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -1220,6 +1425,12 @@ define <4 x float> @acosh_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vacoshf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @acosh_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vacoshq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @acosh_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -1261,6 +1472,12 @@ define <4 x float> @atanh_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vatanhf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @atanh_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vatanhq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @atanh_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -1301,13 +1518,24 @@ define <2 x float> @sin_2x(ptr %a) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A:%.*]], align 16
; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]]) #[[ATTR2:[0-9]+]]
+; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]]) #[[ATTR3:[0-9]+]]
; CHECK-NEXT: [[VECINS:%.*]] = insertelement <2 x float> zeroinitializer, float [[TMP1]], i32 0
; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) #[[ATTR3]]
; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1
; CHECK-NEXT: ret <2 x float> [[VECINS_1]]
;
+; CHECK-ARMPL-LABEL: @sin_2x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]]) #[[ATTR3:[0-9]+]]
+; CHECK-ARMPL-NEXT: [[VECINS:%.*]] = insertelement <2 x float> zeroinitializer, float [[TMP1]], i32 0
+; CHECK-ARMPL-NEXT: [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
+; CHECK-ARMPL-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) #[[ATTR3]]
+; CHECK-ARMPL-NEXT: [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1
+; CHECK-ARMPL-NEXT: ret <2 x float> [[VECINS_1]]
+;
; NOACCELERATE-LABEL: @sin_2x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A:%.*]], align 16
@@ -1341,6 +1569,12 @@ define <4 x float> @int_cos_4x(ptr %a) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vcosf(<4 x float> [[TMP0]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
+; CHECK-ARMPL-LABEL: @int_cos_4x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vcosq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
+;
; NOACCELERATE-LABEL: @int_cos_4x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
@@ -1381,13 +1615,24 @@ define <2 x float> @cos_2x(ptr %a) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A:%.*]], align 16
; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]]) #[[ATTR3:[0-9]+]]
+; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]]) #[[ATTR4:[0-9]+]]
; CHECK-NEXT: [[VECINS:%.*]] = insertelement <2 x float> zeroinitializer, float [[TMP1]], i32 0
; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]]) #[[ATTR3]]
+; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]]) #[[ATTR4]]
; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1
; CHECK-NEXT: ret <2 x float> [[VECINS_1]]
;
+; CHECK-ARMPL-LABEL: @cos_2x(
+; CHECK-ARMPL-NEXT: entry:
+; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A:%.*]], align 16
+; CHECK-ARMPL-NEXT: [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]]) #[[ATTR4:[0-9]+]]
+; CHECK-ARMPL-NEXT: [[VECINS:%.*]] = insertelement <2 x float> zeroinitializer, float [[TMP1]], i32 0
+; CHECK-ARMPL-NEXT: [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
+; CHECK-ARMPL-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]]) #[[ATTR4]]
+; CHECK-ARMPL-NEXT: [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1
+; CHECK-ARMPL-NEXT: ret <2 x float> [[VECINS_1]]
+;
; NOACCELERATE-LABEL: @cos_2x(
; NOACCELERATE-NEXT: entry:
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A:%.*]], align 16
From efbfe41bdbe6d9414cca40b98ee68c0a2a2ccdcc Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Tue, 10 Feb 2026 15:48:26 +0000
Subject: [PATCH 2/2] [SLP] Use the correct calling convention for vector math
routines
When vectorising calls to math intrinsics such as llvm.pow we
correctly detect and generate calls to the corresponding vector
math variant. However, we don't pick up and use the calling
convention for the vector math function. This matters for veclibs
such as ArmPL where the aarch64_vector_pcs calling convention
can improve codegen by reducing the number of registers that
need saving across calls.
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 1 +
.../AArch64/accelerate-vector-functions.ll | 52 +++++++++----------
2 files changed, 27 insertions(+), 26 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index f89c22fafcf04..10b0dfb74f9fc 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -21069,6 +21069,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
propagateIRFlags(V, E->Scalars, VL0);
+ cast<CallInst>(V)->setCallingConv(CF->getCallingConv());
V = FinalShuffle(V, E);
E->VectorizedValue = V;
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll
index e31e7fb9ff81e..3994fcecd286c 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll
@@ -19,7 +19,7 @@ define <4 x float> @int_sin_4x(ptr %a) {
; CHECK-ARMPL-LABEL: @int_sin_4x(
; CHECK-ARMPL-NEXT: entry:
; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
-; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vsinq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast aarch64_vector_pcs <4 x float> @armpl_vsinq_f32(<4 x float> [[TMP0]])
; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @int_sin_4x(
@@ -250,7 +250,7 @@ define <4 x float> @exp_4x(ptr %a) {
; CHECK-ARMPL-LABEL: @exp_4x(
; CHECK-ARMPL-NEXT: entry:
; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
-; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vexpq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast aarch64_vector_pcs <4 x float> @armpl_vexpq_f32(<4 x float> [[TMP0]])
; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @exp_4x(
@@ -297,7 +297,7 @@ define <4 x float> @expm1_4x(ptr %a) {
; CHECK-ARMPL-LABEL: @expm1_4x(
; CHECK-ARMPL-NEXT: entry:
; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
-; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vexpm1q_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast aarch64_vector_pcs <4 x float> @armpl_vexpm1q_f32(<4 x float> [[TMP0]])
; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @expm1_4x(
@@ -344,7 +344,7 @@ define <4 x float> @log_4x(ptr %a) {
; CHECK-ARMPL-LABEL: @log_4x(
; CHECK-ARMPL-NEXT: entry:
; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
-; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vlogq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast aarch64_vector_pcs <4 x float> @armpl_vlogq_f32(<4 x float> [[TMP0]])
; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @log_4x(
@@ -391,7 +391,7 @@ define <4 x float> @log1p_4x(ptr %a) {
; CHECK-ARMPL-LABEL: @log1p_4x(
; CHECK-ARMPL-NEXT: entry:
; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
-; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vlog1pq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast aarch64_vector_pcs <4 x float> @armpl_vlog1pq_f32(<4 x float> [[TMP0]])
; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @log1p_4x(
@@ -565,7 +565,7 @@ define <4 x float> @sin_4x(ptr %a) {
; CHECK-ARMPL-LABEL: @sin_4x(
; CHECK-ARMPL-NEXT: entry:
; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
-; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vsinq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast aarch64_vector_pcs <4 x float> @armpl_vsinq_f32(<4 x float> [[TMP0]])
; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @sin_4x(
@@ -612,7 +612,7 @@ define <4 x float> @cos_4x(ptr %a) {
; CHECK-ARMPL-LABEL: @cos_4x(
; CHECK-ARMPL-NEXT: entry:
; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
-; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vcosq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast aarch64_vector_pcs <4 x float> @armpl_vcosq_f32(<4 x float> [[TMP0]])
; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @cos_4x(
@@ -659,7 +659,7 @@ define <4 x float> @tan_4x(ptr %a) {
; CHECK-ARMPL-LABEL: @tan_4x(
; CHECK-ARMPL-NEXT: entry:
; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
-; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vtanq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast aarch64_vector_pcs <4 x float> @armpl_vtanq_f32(<4 x float> [[TMP0]])
; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @tan_4x(
@@ -706,7 +706,7 @@ define <4 x float> @asin_4x(ptr %a) {
; CHECK-ARMPL-LABEL: @asin_4x(
; CHECK-ARMPL-NEXT: entry:
; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
-; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vasinq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast aarch64_vector_pcs <4 x float> @armpl_vasinq_f32(<4 x float> [[TMP0]])
; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @asin_4x(
@@ -752,7 +752,7 @@ define <4 x float> @int_asin_4x(ptr %a) {
; CHECK-ARMPL-LABEL: @int_asin_4x(
; CHECK-ARMPL-NEXT: entry:
; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
-; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vasinq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast aarch64_vector_pcs <4 x float> @armpl_vasinq_f32(<4 x float> [[TMP0]])
; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @int_asin_4x(
@@ -799,7 +799,7 @@ define <4 x float> @acos_4x(ptr %a) {
; CHECK-ARMPL-LABEL: @acos_4x(
; CHECK-ARMPL-NEXT: entry:
; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
-; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vacosq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast aarch64_vector_pcs <4 x float> @armpl_vacosq_f32(<4 x float> [[TMP0]])
; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @acos_4x(
@@ -845,7 +845,7 @@ define <4 x float> @int_acos_4x(ptr %a) {
; CHECK-ARMPL-LABEL: @int_acos_4x(
; CHECK-ARMPL-NEXT: entry:
; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
-; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vacosq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast aarch64_vector_pcs <4 x float> @armpl_vacosq_f32(<4 x float> [[TMP0]])
; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @int_acos_4x(
@@ -892,7 +892,7 @@ define <4 x float> @atan_4x(ptr %a) {
; CHECK-ARMPL-LABEL: @atan_4x(
; CHECK-ARMPL-NEXT: entry:
; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
-; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vatanq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast aarch64_vector_pcs <4 x float> @armpl_vatanq_f32(<4 x float> [[TMP0]])
; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @atan_4x(
@@ -938,7 +938,7 @@ define <4 x float> @int_atan_4x(ptr %a) {
; CHECK-ARMPL-LABEL: @int_atan_4x(
; CHECK-ARMPL-NEXT: entry:
; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
-; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vatanq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast aarch64_vector_pcs <4 x float> @armpl_vatanq_f32(<4 x float> [[TMP0]])
; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @int_atan_4x(
@@ -987,7 +987,7 @@ define <4 x float> @atan2_4x(ptr %a, ptr %b) {
; CHECK-ARMPL-NEXT: entry:
; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; CHECK-ARMPL-NEXT: [[BB:%.*]] = load <4 x float>, ptr [[B:%.*]], align 16
-; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vatan2q_f32(<4 x float> [[TMP0]], <4 x float> [[BB]])
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast aarch64_vector_pcs <4 x float> @armpl_vatan2q_f32(<4 x float> [[TMP0]], <4 x float> [[BB]])
; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @atan2_4x(
@@ -1045,7 +1045,7 @@ define <4 x float> @int_atan2_4x(ptr %a, ptr %b) {
; CHECK-ARMPL-NEXT: entry:
; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
; CHECK-ARMPL-NEXT: [[BB:%.*]] = load <4 x float>, ptr [[B:%.*]], align 16
-; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vatan2q_f32(<4 x float> [[TMP0]], <4 x float> [[BB]])
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast aarch64_vector_pcs <4 x float> @armpl_vatan2q_f32(<4 x float> [[TMP0]], <4 x float> [[BB]])
; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @int_atan2_4x(
@@ -1102,7 +1102,7 @@ define <4 x float> @sinh_4x(ptr %a) {
; CHECK-ARMPL-LABEL: @sinh_4x(
; CHECK-ARMPL-NEXT: entry:
; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
-; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vsinhq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast aarch64_vector_pcs <4 x float> @armpl_vsinhq_f32(<4 x float> [[TMP0]])
; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @sinh_4x(
@@ -1148,7 +1148,7 @@ define <4 x float> @int_sinh_4x(ptr %a) {
; CHECK-ARMPL-LABEL: @int_sinh_4x(
; CHECK-ARMPL-NEXT: entry:
; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
-; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vsinhq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast aarch64_vector_pcs <4 x float> @armpl_vsinhq_f32(<4 x float> [[TMP0]])
; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @int_sinh_4x(
@@ -1195,7 +1195,7 @@ define <4 x float> @cosh_4x(ptr %a) {
; CHECK-ARMPL-LABEL: @cosh_4x(
; CHECK-ARMPL-NEXT: entry:
; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
-; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vcoshq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast aarch64_vector_pcs <4 x float> @armpl_vcoshq_f32(<4 x float> [[TMP0]])
; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @cosh_4x(
@@ -1241,7 +1241,7 @@ define <4 x float> @int_cosh_4x(ptr %a) {
; CHECK-ARMPL-LABEL: @int_cosh_4x(
; CHECK-ARMPL-NEXT: entry:
; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
-; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vcoshq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast aarch64_vector_pcs <4 x float> @armpl_vcoshq_f32(<4 x float> [[TMP0]])
; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @int_cosh_4x(
@@ -1288,7 +1288,7 @@ define <4 x float> @tanh_4x(ptr %a) {
; CHECK-ARMPL-LABEL: @tanh_4x(
; CHECK-ARMPL-NEXT: entry:
; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
-; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vtanhq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast aarch64_vector_pcs <4 x float> @armpl_vtanhq_f32(<4 x float> [[TMP0]])
; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @tanh_4x(
@@ -1334,7 +1334,7 @@ define <4 x float> @int_tanh_4x(ptr %a) {
; CHECK-ARMPL-LABEL: @int_tanh_4x(
; CHECK-ARMPL-NEXT: entry:
; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
-; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vtanhq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast aarch64_vector_pcs <4 x float> @armpl_vtanhq_f32(<4 x float> [[TMP0]])
; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @int_tanh_4x(
@@ -1381,7 +1381,7 @@ define <4 x float> @asinh_4x(ptr %a) {
; CHECK-ARMPL-LABEL: @asinh_4x(
; CHECK-ARMPL-NEXT: entry:
; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
-; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vasinhq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast aarch64_vector_pcs <4 x float> @armpl_vasinhq_f32(<4 x float> [[TMP0]])
; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @asinh_4x(
@@ -1428,7 +1428,7 @@ define <4 x float> @acosh_4x(ptr %a) {
; CHECK-ARMPL-LABEL: @acosh_4x(
; CHECK-ARMPL-NEXT: entry:
; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
-; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vacoshq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast aarch64_vector_pcs <4 x float> @armpl_vacoshq_f32(<4 x float> [[TMP0]])
; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @acosh_4x(
@@ -1475,7 +1475,7 @@ define <4 x float> @atanh_4x(ptr %a) {
; CHECK-ARMPL-LABEL: @atanh_4x(
; CHECK-ARMPL-NEXT: entry:
; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
-; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vatanhq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast aarch64_vector_pcs <4 x float> @armpl_vatanhq_f32(<4 x float> [[TMP0]])
; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @atanh_4x(
@@ -1572,7 +1572,7 @@ define <4 x float> @int_cos_4x(ptr %a) {
; CHECK-ARMPL-LABEL: @int_cos_4x(
; CHECK-ARMPL-NEXT: entry:
; CHECK-ARMPL-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
-; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vcosq_f32(<4 x float> [[TMP0]])
+; CHECK-ARMPL-NEXT: [[TMP1:%.*]] = call fast aarch64_vector_pcs <4 x float> @armpl_vcosq_f32(<4 x float> [[TMP0]])
; CHECK-ARMPL-NEXT: ret <4 x float> [[TMP1]]
;
; NOACCELERATE-LABEL: @int_cos_4x(
More information about the llvm-commits
mailing list