[llvm] 59dcf92 - [TLI][AArch64] Extend SLEEF vectorized functions mapping with VLA functions

Paul Osmialowski via llvm-commits <llvm-commits at lists.llvm.org>
Wed Mar 29 03:08:11 PDT 2023


Author: Paul Osmialowski
Date: 2023-03-29T11:07:35+01:00
New Revision: 59dcf927ee43e995374907b6846b657f68d7ea49

URL: https://github.com/llvm/llvm-project/commit/59dcf927ee43e995374907b6846b657f68d7ea49
DIFF: https://github.com/llvm/llvm-project/commit/59dcf927ee43e995374907b6846b657f68d7ea49.diff

LOG: [TLI][AArch64] Extend SLEEF vectorized functions mapping with VLA functions

This commit extends D134719 "[AArch64] Enable libm vectorized
functions via SLEEF" with mappings for the scalable (VLA) functions.

It also introduces the changes needed to support masked vector
interfaces.

Signed-off-by: Paul Osmialowski <pawel.osmialowski at arm.com>

Added: 
    

Modified: 
    llvm/include/llvm/Analysis/TargetLibraryInfo.h
    llvm/include/llvm/Analysis/VecFuncs.def
    llvm/include/llvm/Analysis/VectorUtils.h
    llvm/lib/Analysis/TargetLibraryInfo.cpp
    llvm/lib/Analysis/VectorUtils.cpp
    llvm/lib/Transforms/Utils/InjectTLIMappings.cpp
    llvm/test/Transforms/LoopVectorize/AArch64/sleef-calls-aarch64.ll

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.h b/llvm/include/llvm/Analysis/TargetLibraryInfo.h
index 1626798ba1c21..1850b7a599606 100644
--- a/llvm/include/llvm/Analysis/TargetLibraryInfo.h
+++ b/llvm/include/llvm/Analysis/TargetLibraryInfo.h
@@ -31,6 +31,7 @@ struct VecDesc {
   StringRef ScalarFnName;
   StringRef VectorFnName;
   ElementCount VectorizationFactor;
+  bool Masked;
 };
 
   enum LibFunc : unsigned {
@@ -161,7 +162,8 @@ class TargetLibraryInfoImpl {
   /// Return true if the function F has a vector equivalent with vectorization
   /// factor VF.
   bool isFunctionVectorizable(StringRef F, const ElementCount &VF) const {
-    return !getVectorizedFunction(F, VF).empty();
+    return !(getVectorizedFunction(F, VF, false).empty() &&
+             getVectorizedFunction(F, VF, true).empty());
   }
 
   /// Return true if the function F has a vector equivalent with any
@@ -170,7 +172,8 @@ class TargetLibraryInfoImpl {
 
   /// Return the name of the equivalent of F, vectorized with factor VF. If no
   /// such mapping exists, return the empty string.
-  StringRef getVectorizedFunction(StringRef F, const ElementCount &VF) const;
+  StringRef getVectorizedFunction(StringRef F, const ElementCount &VF,
+                                  bool Masked) const;
 
   /// Set to true iff i32 parameters to library functions should have signext
   /// or zeroext attributes if they correspond to C-level int or unsigned int,
@@ -346,8 +349,9 @@ class TargetLibraryInfo {
   bool isFunctionVectorizable(StringRef F) const {
     return Impl->isFunctionVectorizable(F);
   }
-  StringRef getVectorizedFunction(StringRef F, const ElementCount &VF) const {
-    return Impl->getVectorizedFunction(F, VF);
+  StringRef getVectorizedFunction(StringRef F, const ElementCount &VF,
+                                  bool Masked = false) const {
+    return Impl->getVectorizedFunction(F, VF, Masked);
   }
 
   /// Tests if the function is both available and a candidate for optimized code

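As a quick illustration of the extended interface (a sketch, not code
from this patch; the function name and VF below are assumptions), a
client holding a TargetLibraryInfo for a SLEEF-enabled AArch64 target
could now query a masked scalable variant directly:

  #include "llvm/Analysis/TargetLibraryInfo.h"
  #include "llvm/Support/TypeSize.h"

  // Sketch: returns "_ZGVsMxv_sin" for a TLI built with
  // -vector-library=sleefgnuabi on AArch64. With Masked=false the
  // result would be empty, since the scalable SLEEF entries below are
  // registered as masked-only.
  llvm::StringRef lookupMaskedSin(const llvm::TargetLibraryInfo &TLI) {
    return TLI.getVectorizedFunction(
        "sin", llvm::ElementCount::getScalable(2), /*Masked=*/true);
  }
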
diff --git a/llvm/include/llvm/Analysis/VecFuncs.def b/llvm/include/llvm/Analysis/VecFuncs.def
index 34f5b3904cb07..f5aaa49be9b30 100644
--- a/llvm/include/llvm/Analysis/VecFuncs.def
+++ b/llvm/include/llvm/Analysis/VecFuncs.def
@@ -19,9 +19,11 @@
 
 #define FIXED(NL) ElementCount::getFixed(NL)
 #define SCALABLE(NL) ElementCount::getScalable(NL)
+#define NOMASK false
+#define MASKED true
 
 #if !(defined(TLI_DEFINE_VECFUNC))
-#define TLI_DEFINE_VECFUNC(SCAL, VEC, VF) {SCAL, VEC, VF},
+#define TLI_DEFINE_VECFUNC(SCAL, VEC, VF) {SCAL, VEC, VF, NOMASK},
 #endif
 
 #if defined(TLI_DEFINE_ACCELERATE_VECFUNCS)
@@ -604,10 +606,91 @@ TLI_DEFINE_VECFUNC( "llvm.tanh.f32", "_ZGVnN4v_tanhf", FIXED(4))
 TLI_DEFINE_VECFUNC( "tgammaf", "_ZGVnN4v_tgammaf", FIXED(4))
 TLI_DEFINE_VECFUNC( "llvm.tgamma.f32", "_ZGVnN4v_tgammaf", FIXED(4))
 
+#elif defined(TLI_DEFINE_SLEEFGNUABI_SCALABLE_VECFUNCS)
+
+TLI_DEFINE_VECFUNC("acos", "_ZGVsMxv_acos",  SCALABLE(2), MASKED)
+TLI_DEFINE_VECFUNC("acosf", "_ZGVsMxv_acosf", SCALABLE(4), MASKED)
+
+TLI_DEFINE_VECFUNC("asin", "_ZGVsMxv_asin",  SCALABLE(2), MASKED)
+TLI_DEFINE_VECFUNC("asinf", "_ZGVsMxv_asinf", SCALABLE(4), MASKED)
+
+TLI_DEFINE_VECFUNC("atan", "_ZGVsMxv_atan",  SCALABLE(2), MASKED)
+TLI_DEFINE_VECFUNC("atanf", "_ZGVsMxv_atanf", SCALABLE(4), MASKED)
+
+TLI_DEFINE_VECFUNC("atan2", "_ZGVsMxvv_atan2",  SCALABLE(2), MASKED)
+TLI_DEFINE_VECFUNC("atan2f", "_ZGVsMxvv_atan2f", SCALABLE(4), MASKED)
+
+TLI_DEFINE_VECFUNC("atanh", "_ZGVsMxv_atanh",  SCALABLE(2), MASKED)
+TLI_DEFINE_VECFUNC("atanhf", "_ZGVsMxv_atanhf", SCALABLE(4), MASKED)
+
+TLI_DEFINE_VECFUNC("cos", "_ZGVsMxv_cos",  SCALABLE(2), MASKED)
+TLI_DEFINE_VECFUNC("cosf", "_ZGVsMxv_cosf", SCALABLE(4), MASKED)
+TLI_DEFINE_VECFUNC("llvm.cos.f64", "_ZGVsMxv_cos", SCALABLE(2), MASKED)
+TLI_DEFINE_VECFUNC("llvm.cos.f32", "_ZGVsMxv_cosf", SCALABLE(4), MASKED)
+
+TLI_DEFINE_VECFUNC("cosh", "_ZGVsMxv_cosh",  SCALABLE(2), MASKED)
+TLI_DEFINE_VECFUNC("coshf", "_ZGVsMxv_coshf", SCALABLE(4), MASKED)
+
+TLI_DEFINE_VECFUNC("exp", "_ZGVsMxv_exp",  SCALABLE(2), MASKED)
+TLI_DEFINE_VECFUNC("expf", "_ZGVsMxv_expf", SCALABLE(4), MASKED)
+TLI_DEFINE_VECFUNC("llvm.exp.f64", "_ZGVsMxv_exp", SCALABLE(2), MASKED)
+TLI_DEFINE_VECFUNC("llvm.exp.f32", "_ZGVsMxv_expf", SCALABLE(4), MASKED)
+
+TLI_DEFINE_VECFUNC("exp2", "_ZGVsMxv_exp2",  SCALABLE(2), MASKED)
+TLI_DEFINE_VECFUNC("exp2f", "_ZGVsMxv_exp2f", SCALABLE(4), MASKED)
+TLI_DEFINE_VECFUNC("llvm.exp2.f64", "_ZGVsMxv_exp2", SCALABLE(2), MASKED)
+TLI_DEFINE_VECFUNC("llvm.exp2.f32", "_ZGVsMxv_exp2f", SCALABLE(4), MASKED)
+
+TLI_DEFINE_VECFUNC("exp10", "_ZGVsMxv_exp10",  SCALABLE(2), MASKED)
+TLI_DEFINE_VECFUNC("exp10f", "_ZGVsMxv_exp10f", SCALABLE(4), MASKED)
+
+TLI_DEFINE_VECFUNC("lgamma", "_ZGVsMxv_lgamma",  SCALABLE(2), MASKED)
+TLI_DEFINE_VECFUNC("lgammaf", "_ZGVsMxv_lgammaf", SCALABLE(4), MASKED)
+
+TLI_DEFINE_VECFUNC("log", "_ZGVsMxv_log",  SCALABLE(2), MASKED)
+TLI_DEFINE_VECFUNC("logf", "_ZGVsMxv_logf", SCALABLE(4), MASKED)
+TLI_DEFINE_VECFUNC("llvm.log.f64", "_ZGVsMxv_log", SCALABLE(2), MASKED)
+TLI_DEFINE_VECFUNC("llvm.log.f32", "_ZGVsMxv_logf", SCALABLE(4), MASKED)
+
+TLI_DEFINE_VECFUNC("log10", "_ZGVsMxv_log10",  SCALABLE(2), MASKED)
+TLI_DEFINE_VECFUNC("log10f", "_ZGVsMxv_log10f", SCALABLE(4), MASKED)
+TLI_DEFINE_VECFUNC("llvm.log10.f64", "_ZGVsMxv_log10", SCALABLE(2), MASKED)
+TLI_DEFINE_VECFUNC("llvm.log10.f32", "_ZGVsMxv_log10f", SCALABLE(4), MASKED)
+
+TLI_DEFINE_VECFUNC("pow", "_ZGVsMxvv_pow", SCALABLE(2), MASKED)
+TLI_DEFINE_VECFUNC("powf", "_ZGVsMxvv_powf", SCALABLE(4), MASKED)
+TLI_DEFINE_VECFUNC("llvm.pow.f64", "_ZGVsMxvv_pow", SCALABLE(2), MASKED)
+TLI_DEFINE_VECFUNC("llvm.pow.f32", "_ZGVsMxvv_powf", SCALABLE(4), MASKED)
+
+TLI_DEFINE_VECFUNC("sin", "_ZGVsMxv_sin",  SCALABLE(2), MASKED)
+TLI_DEFINE_VECFUNC("sinf", "_ZGVsMxv_sinf", SCALABLE(4), MASKED)
+TLI_DEFINE_VECFUNC("llvm.sin.f64", "_ZGVsMxv_sin", SCALABLE(2), MASKED)
+TLI_DEFINE_VECFUNC("llvm.sin.f32", "_ZGVsMxv_sinf", SCALABLE(4), MASKED)
+
+TLI_DEFINE_VECFUNC("sinh", "_ZGVsMxv_sinh",  SCALABLE(2), MASKED)
+TLI_DEFINE_VECFUNC("sinhf", "_ZGVsMxv_sinhf", SCALABLE(4), MASKED)
+
+TLI_DEFINE_VECFUNC("sqrt", "_ZGVsMxv_sqrt",  SCALABLE(2), MASKED)
+TLI_DEFINE_VECFUNC("sqrtf", "_ZGVsMxv_sqrtf", SCALABLE(4), MASKED)
+
+TLI_DEFINE_VECFUNC("tan", "_ZGVsMxv_tan",  SCALABLE(2), MASKED)
+TLI_DEFINE_VECFUNC("tanf", "_ZGVsMxv_tanf", SCALABLE(4), MASKED)
+
+TLI_DEFINE_VECFUNC("tanh", "_ZGVsMxv_tanh",  SCALABLE(2), MASKED)
+TLI_DEFINE_VECFUNC("tanhf", "_ZGVsMxv_tanhf", SCALABLE(4), MASKED)
+
+TLI_DEFINE_VECFUNC("tgamma", "_ZGVsMxv_tgamma",  SCALABLE(2), MASKED)
+TLI_DEFINE_VECFUNC("tgammaf", "_ZGVsMxv_tgammaf", SCALABLE(4), MASKED)
+
 #else
 #error "Must choose which vector library functions are to be defined."
 #endif
 
+#undef MASKED
+#undef NOMASK
+#undef SCALABLE
+#undef FIXED
+
 #undef TLI_DEFINE_VECFUNC
 #undef TLI_DEFINE_ACCELERATE_VECFUNCS
 #undef TLI_DEFINE_DARWIN_LIBSYSTEM_M_VECFUNCS
@@ -616,4 +699,5 @@ TLI_DEFINE_VECFUNC( "llvm.tgamma.f32", "_ZGVnN4v_tgammaf", FIXED(4))
 #undef TLI_DEFINE_SVML_VECFUNCS
 #undef TLI_DEFINE_SLEEFGNUABI_VF2_VECFUNCS
 #undef TLI_DEFINE_SLEEFGNUABI_VF4_VECFUNCS
+#undef TLI_DEFINE_SLEEFGNUABI_SCALABLE_VECFUNCS
 #undef TLI_DEFINE_MASSV_VECFUNCS_NAMES

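For reference, the new names follow the AArch64 vector function ABI
mangling _ZGV<isa><mask><vlen><params>_<scalarname>: in
"_ZGVsMxv_acos", 's' selects the SVE ISA, 'M' marks the masked
interface, 'x' denotes a scalable vector length, and each 'v' describes
one vector argument (hence "vv" for the two-argument atan2 and pow
variants).
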
diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h
index da223f30e81a7..b19b1029a03f2 100644
--- a/llvm/include/llvm/Analysis/VectorUtils.h
+++ b/llvm/include/llvm/Analysis/VectorUtils.h
@@ -192,7 +192,7 @@ std::optional<VFInfo> tryDemangleForVFABI(StringRef MangledName,
 /// where:
 ///
 /// <isa> = "_LLVM_"
-/// <mask> = "N". Note: TLI does not support masked interfaces.
+/// <mask> = "M" if masked, "N" if no mask.
 /// <vlen> = Number of concurrent lanes, stored in the `VectorizationFactor`
 ///          field of the `VecDesc` struct. If the number of lanes is scalable
 ///          then 'x' is printed instead.
@@ -200,7 +200,8 @@ std::optional<VFInfo> tryDemangleForVFABI(StringRef MangledName,
 /// <scalarname> = the name of the scalar function.
 /// <vectorname> = the name of the vector function.
 std::string mangleTLIVectorName(StringRef VectorName, StringRef ScalarName,
-                                unsigned numArgs, ElementCount VF);
+                                unsigned numArgs, ElementCount VF,
+                                bool Masked = false);
 
 /// Retrieve the `VFParamKind` from a string token.
 VFParamKind getVFParamKindFromString(const StringRef Token);

diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp
index c57c2beeb0281..666144158362f 100644
--- a/llvm/lib/Analysis/TargetLibraryInfo.cpp
+++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp
@@ -1181,10 +1181,17 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib(
   case SLEEFGNUABI: {
     const VecDesc VecFuncs_VF2[] = {
 #define TLI_DEFINE_SLEEFGNUABI_VF2_VECFUNCS
+#define TLI_DEFINE_VECFUNC(SCAL, VEC, VF) {SCAL, VEC, VF, /* MASK = */ false},
 #include "llvm/Analysis/VecFuncs.def"
     };
     const VecDesc VecFuncs_VF4[] = {
 #define TLI_DEFINE_SLEEFGNUABI_VF4_VECFUNCS
+#define TLI_DEFINE_VECFUNC(SCAL, VEC, VF) {SCAL, VEC, VF, /* MASK = */ false},
+#include "llvm/Analysis/VecFuncs.def"
+    };
+    const VecDesc VecFuncs_VFScalable[] = {
+#define TLI_DEFINE_SLEEFGNUABI_SCALABLE_VECFUNCS
+#define TLI_DEFINE_VECFUNC(SCAL, VEC, VF, MASK) {SCAL, VEC, VF, MASK},
 #include "llvm/Analysis/VecFuncs.def"
     };
 
@@ -1195,6 +1202,7 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib(
     case llvm::Triple::aarch64_be:
       addVectorizableFunctions(VecFuncs_VF2);
       addVectorizableFunctions(VecFuncs_VF4);
+      addVectorizableFunctions(VecFuncs_VFScalable);
       break;
     }
     break;
@@ -1214,16 +1222,16 @@ bool TargetLibraryInfoImpl::isFunctionVectorizable(StringRef funcName) const {
   return I != VectorDescs.end() && StringRef(I->ScalarFnName) == funcName;
 }
 
-StringRef
-TargetLibraryInfoImpl::getVectorizedFunction(StringRef F,
-                                             const ElementCount &VF) const {
+StringRef TargetLibraryInfoImpl::getVectorizedFunction(StringRef F,
+                                                       const ElementCount &VF,
+                                                       bool Masked) const {
   F = sanitizeFunctionName(F);
   if (F.empty())
     return F;
   std::vector<VecDesc>::const_iterator I =
       llvm::lower_bound(VectorDescs, F, compareWithScalarFnName);
   while (I != VectorDescs.end() && StringRef(I->ScalarFnName) == F) {
-    if (I->VectorizationFactor == VF)
+    if ((I->VectorizationFactor == VF) && (I->Masked == Masked))
       return I->VectorFnName;
     ++I;
   }

diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 31c2de15d602d..cab0082332ad0 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -1529,10 +1529,10 @@ void InterleaveGroup<Instruction>::addMetadata(Instruction *NewInst) const {
 
 std::string VFABI::mangleTLIVectorName(StringRef VectorName,
                                        StringRef ScalarName, unsigned numArgs,
-                                       ElementCount VF) {
+                                       ElementCount VF, bool Masked) {
   SmallString<256> Buffer;
   llvm::raw_svector_ostream Out(Buffer);
-  Out << "_ZGV" << VFABI::_LLVM_ << "N";
+  Out << "_ZGV" << VFABI::_LLVM_ << (Masked ? "M" : "N");
   if (VF.isScalable())
     Out << 'x';
   else

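A minimal sketch of what the updated helper now emits for the masked
scalable case, assuming the mangling scheme documented in VectorUtils.h
(the values below are illustrative, not taken from the patch):

  // Sketch: 'M' and 'x' replace the old fixed "N<vlen>" in the name.
  std::string Name = llvm::VFABI::mangleTLIVectorName(
      "_ZGVsMxv_sin", "sin", /*numArgs=*/1,
      llvm::ElementCount::getScalable(2), /*Masked=*/true);
  // Name == "_ZGV_LLVM_Mxv_sin(_ZGVsMxv_sin)"
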
diff --git a/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp b/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp
index 55bcb6f3b1218..873c3222be50e 100644
--- a/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp
+++ b/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp
@@ -40,7 +40,7 @@ STATISTIC(NumCompUsedAdded,
 /// CI (other than void) need to be widened to a VectorType of VF
 /// lanes.
 static void addVariantDeclaration(CallInst &CI, const ElementCount &VF,
-                                  const StringRef VFName) {
+                                  bool Predicate, const StringRef VFName) {
   Module *M = CI.getModule();
 
   // Add function declaration.
@@ -50,6 +50,8 @@ static void addVariantDeclaration(CallInst &CI, const ElementCount &VF,
     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
   assert(!CI.getFunctionType()->isVarArg() &&
          "VarArg functions are not supported.");
+  if (Predicate)
+    Tys.push_back(ToVectorTy(Type::getInt1Ty(RetTy->getContext()), VF));
   FunctionType *FTy = FunctionType::get(RetTy, Tys, /*isVarArg=*/false);
   Function *VectorF =
       Function::Create(FTy, Function::ExternalLinkage, VFName, M);
@@ -89,19 +91,19 @@ static void addMappingsFromTLI(const TargetLibraryInfo &TLI, CallInst &CI) {
   const SetVector<StringRef> OriginalSetOfMappings(Mappings.begin(),
                                                    Mappings.end());
 
-  auto AddVariantDecl = [&](const ElementCount &VF) {
+  auto AddVariantDecl = [&](const ElementCount &VF, bool Predicate) {
     const std::string TLIName =
-        std::string(TLI.getVectorizedFunction(ScalarName, VF));
+        std::string(TLI.getVectorizedFunction(ScalarName, VF, Predicate));
     if (!TLIName.empty()) {
-      std::string MangledName =
-          VFABI::mangleTLIVectorName(TLIName, ScalarName, CI.arg_size(), VF);
+      std::string MangledName = VFABI::mangleTLIVectorName(
+          TLIName, ScalarName, CI.arg_size(), VF, Predicate);
       if (!OriginalSetOfMappings.count(MangledName)) {
         Mappings.push_back(MangledName);
         ++NumCallInjected;
       }
       Function *VariantF = M->getFunction(TLIName);
       if (!VariantF)
-        addVariantDeclaration(CI, VF, TLIName);
+        addVariantDeclaration(CI, VF, Predicate, TLIName);
     }
   };
 
@@ -109,13 +111,15 @@ static void addMappingsFromTLI(const TargetLibraryInfo &TLI, CallInst &CI) {
   ElementCount WidestFixedVF, WidestScalableVF;
   TLI.getWidestVF(ScalarName, WidestFixedVF, WidestScalableVF);
 
-  for (ElementCount VF = ElementCount::getFixed(2);
-       ElementCount::isKnownLE(VF, WidestFixedVF); VF *= 2)
-    AddVariantDecl(VF);
+  for (bool Predicated : {false, true}) {
+    for (ElementCount VF = ElementCount::getFixed(2);
+         ElementCount::isKnownLE(VF, WidestFixedVF); VF *= 2)
+      AddVariantDecl(VF, Predicated);
 
-  // TODO: Add scalable variants once we're able to test them.
-  assert(WidestScalableVF.isZero() &&
-         "Scalable vector mappings not yet supported");
+    for (ElementCount VF = ElementCount::getScalable(2);
+         ElementCount::isKnownLE(VF, WidestScalableVF); VF *= 2)
+      AddVariantDecl(VF, Predicated);
+  }
 
   VFABI::setVectorVariantNames(&CI, Mappings);
 }

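With both loops in place, a single call site can collect unmasked
fixed-width and masked scalable mappings at once; illustratively (not
taken from the patch), a sin call could end up annotated with
"vector-function-abi-variant"="_ZGV_LLVM_N2v_sin(_ZGVnN2v_sin),_ZGV_LLVM_Mxv_sin(_ZGVsMxv_sin)".
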
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sleef-calls-aarch64.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sleef-calls-aarch64.ll
index be4309996fbb5..0d14eccf79fb3 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sleef-calls-aarch64.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sleef-calls-aarch64.ll
@@ -1,5 +1,6 @@
 ; Do NOT use -O3. It will lower exp2 to ldexp, and the test will fail.
-; RUN: opt -vector-library=sleefgnuabi -replace-with-veclib < %s | opt -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-unroll,loop-vectorize -S | FileCheck %s
+; RUN: opt -vector-library=sleefgnuabi -replace-with-veclib < %s | opt -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-unroll,loop-vectorize -S | FileCheck %s --check-prefixes=CHECK,NEON
+; RUN: opt -mattr=+sve -vector-library=sleefgnuabi -replace-with-veclib < %s | opt -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-unroll,loop-vectorize -S | FileCheck %s --check-prefixes=CHECK,SVE
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-unknown-linux-gnu"
@@ -11,7 +12,8 @@ declare float @llvm.acos.f32(float) #0
 
 define void @acos_f64(double* nocapture %varray) {
   ; CHECK-LABEL: @acos_f64(
-  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_acos(<2 x double> [[TMP4:%.*]])
+  ; NEON:     [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_acos(<2 x double> [[TMP4:%.*]])
+  ; SVE:      [[TMP5:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_acos(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
   ; CHECK:    ret void
   ;
   entry:
@@ -34,7 +36,8 @@ define void @acos_f64(double* nocapture %varray) {
 
 define void @acos_f32(float* nocapture %varray) {
   ; CHECK-LABEL: @acos_f32(
-  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_acosf(<4 x float> [[TMP4:%.*]])
+  ; NEON:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_acosf(<4 x float> [[TMP4:%.*]])
+  ; SVE:     [[TMP5:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_acosf(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
   ; CHECK:    ret void
   ;
   entry:
@@ -62,7 +65,8 @@ declare float @llvm.asin.f32(float) #0
 
 define void @asin_f64(double* nocapture %varray) {
   ; CHECK-LABEL: @asin_f64(
-  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_asin(<2 x double> [[TMP4:%.*]])
+  ; NEON:     [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_asin(<2 x double> [[TMP4:%.*]])
+  ; SVE:      [[TMP5:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_asin(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
   ; CHECK:    ret void
   ;
   entry:
@@ -85,7 +89,8 @@ define void @asin_f64(double* nocapture %varray) {
 
 define void @asin_f32(float* nocapture %varray) {
   ; CHECK-LABEL: @asin_f32(
-  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_asinf(<4 x float> [[TMP4:%.*]])
+  ; NEON:     [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_asinf(<4 x float> [[TMP4:%.*]])
+  ; SVE:      [[TMP5:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_asinf(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
   ; CHECK:    ret void
   ;
   entry:
@@ -113,7 +118,8 @@ declare float @llvm.atan.f32(float) #0
 
 define void @atan_f64(double* nocapture %varray) {
   ; CHECK-LABEL: @atan_f64(
-  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_atan(<2 x double> [[TMP4:%.*]])
+  ; NEON:     [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_atan(<2 x double> [[TMP4:%.*]])
+  ; SVE:      [[TMP5:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_atan(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
   ; CHECK:    ret void
   ;
   entry:
@@ -136,7 +142,8 @@ define void @atan_f64(double* nocapture %varray) {
 
 define void @atan_f32(float* nocapture %varray) {
   ; CHECK-LABEL: @atan_f32(
-  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_atanf(<4 x float> [[TMP4:%.*]])
+  ; NEON:     [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_atanf(<4 x float> [[TMP4:%.*]])
+  ; SVE:      [[TMP5:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_atanf(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
   ; CHECK:    ret void
   ;
   entry:
@@ -164,7 +171,8 @@ declare float @llvm.atan2.f32(float, float) #0
 
 define void @atan2_f64(double* nocapture %varray) {
   ; CHECK-LABEL: @atan2_f64(
-  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2vv_atan2(<2 x double> [[TMP4:%.*]], <2 x double> [[TMP4:%.*]])
+  ; NEON:     [[TMP5:%.*]] = call <2 x double> @_ZGVnN2vv_atan2(<2 x double> [[TMP4:%.*]], <2 x double> [[TMP4:%.*]])
+  ; SVE:      [[TMP5:%.*]] = call <vscale x 2 x double> @_ZGVsMxvv_atan2(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
   ; CHECK:    ret void
   ;
   entry:
@@ -187,7 +195,8 @@ define void @atan2_f64(double* nocapture %varray) {
 
 define void @atan2_f32(float* nocapture %varray) {
   ; CHECK-LABEL: @atan2_f32(
-  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4vv_atan2f(<4 x float> [[TMP4:%.*]], <4 x float> [[TMP4:%.*]])
+  ; NEON:     [[TMP5:%.*]] = call <4 x float> @_ZGVnN4vv_atan2f(<4 x float> [[TMP4:%.*]], <4 x float> [[TMP4:%.*]])
+  ; SVE:      [[TMP5:%.*]] = call <vscale x 4 x float> @_ZGVsMxvv_atan2f(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
   ; CHECK:    ret void
   ;
   entry:
@@ -215,7 +224,8 @@ declare float @llvm.atanh.f32(float) #0
 
 define void @atanh_f64(double* nocapture %varray) {
   ; CHECK-LABEL: @atanh_f64(
-  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_atanh(<2 x double> [[TMP4:%.*]])
+  ; NEON:     [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_atanh(<2 x double> [[TMP4:%.*]])
+  ; SVE:      [[TMP5:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_atanh(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
   ; CHECK:    ret void
   ;
   entry:
@@ -238,7 +248,8 @@ define void @atanh_f64(double* nocapture %varray) {
 
 define void @atanh_f32(float* nocapture %varray) {
   ; CHECK-LABEL: @atanh_f32(
-  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_atanhf(<4 x float> [[TMP4:%.*]])
+  ; NEON:     [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_atanhf(<4 x float> [[TMP4:%.*]])
+  ; SVE:      [[TMP5:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_atanhf(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
   ; CHECK:    ret void
   ;
   entry:
@@ -266,7 +277,8 @@ declare float @llvm.cos.f32(float) #0
 
 define void @cos_f64(double* nocapture %varray) {
   ; CHECK-LABEL: @cos_f64(
-  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_cos(<2 x double> [[TMP4:%.*]])
+  ; NEON:     [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_cos(<2 x double> [[TMP4:%.*]])
+  ; SVE:      [[TMP5:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_cos(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
   ; CHECK:    ret void
   ;
   entry:
@@ -289,7 +301,8 @@ define void @cos_f64(double* nocapture %varray) {
 
 define void @cos_f32(float* nocapture %varray) {
   ; CHECK-LABEL: @cos_f32(
-  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_cosf(<4 x float> [[TMP4:%.*]])
+  ; NEON:     [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_cosf(<4 x float> [[TMP4:%.*]])
+  ; SVE:      [[TMP5:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_cosf(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
   ; CHECK:    ret void
   ;
   entry:
@@ -317,7 +330,8 @@ declare float @llvm.cosh.f32(float) #0
 
 define void @cosh_f64(double* nocapture %varray) {
   ; CHECK-LABEL: @cosh_f64(
-  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_cosh(<2 x double> [[TMP4:%.*]])
+  ; NEON:     [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_cosh(<2 x double> [[TMP4:%.*]])
+  ; SVE:      [[TMP5:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_cosh(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
   ; CHECK:    ret void
   ;
   entry:
@@ -340,7 +354,8 @@ define void @cosh_f64(double* nocapture %varray) {
 
 define void @cosh_f32(float* nocapture %varray) {
   ; CHECK-LABEL: @cosh_f32(
-  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_coshf(<4 x float> [[TMP4:%.*]])
+  ; NEON:     [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_coshf(<4 x float> [[TMP4:%.*]])
+  ; SVE:      [[TMP5:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_coshf(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
   ; CHECK:    ret void
   ;
   entry:
@@ -368,7 +383,8 @@ declare float @llvm.exp.f32(float) #0
 
 define void @exp_f64(double* nocapture %varray) {
   ; CHECK-LABEL: @exp_f64(
-  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_exp(<2 x double> [[TMP4:%.*]])
+  ; NEON:     [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_exp(<2 x double> [[TMP4:%.*]])
+  ; SVE:      [[TMP5:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
   ; CHECK:    ret void
   ;
   entry:
@@ -391,7 +407,8 @@ define void @exp_f64(double* nocapture %varray) {
 
 define void @exp_f32(float* nocapture %varray) {
   ; CHECK-LABEL: @exp_f32(
-  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_expf(<4 x float> [[TMP4:%.*]])
+  ; NEON:     [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_expf(<4 x float> [[TMP4:%.*]])
+  ; SVE:      [[TMP5:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_expf(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
   ; CHECK:    ret void
   ;
   entry:
@@ -419,7 +436,8 @@ declare float @llvm.exp2.f32(float) #0
 
 define void @exp2_f64(double* nocapture %varray) {
   ; CHECK-LABEL: @exp2_f64(
-  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_exp2(<2 x double> [[TMP4:%.*]])
+  ; NEON:     [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_exp2(<2 x double> [[TMP4:%.*]])
+  ; SVE:      [[TMP5:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp2(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
   ; CHECK:    ret void
   ;
   entry:
@@ -442,7 +460,8 @@ define void @exp2_f64(double* nocapture %varray) {
 
 define void @exp2_f32(float* nocapture %varray) {
   ; CHECK-LABEL: @exp2_f32(
-  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_exp2f(<4 x float> [[TMP4:%.*]])
+  ; NEON:     [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_exp2f(<4 x float> [[TMP4:%.*]])
+  ; SVE:      [[TMP5:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_exp2f(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
   ; CHECK:    ret void
   ;
   entry:
@@ -470,7 +489,8 @@ declare float @llvm.exp10.f32(float) #0
 
 define void @exp10_f64(double* nocapture %varray) {
   ; CHECK-LABEL: @exp10_f64(
-  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_exp10(<2 x double> [[TMP4:%.*]])
+  ; NEON:     [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_exp10(<2 x double> [[TMP4:%.*]])
+  ; SVE:      [[TMP5:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp10(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
   ; CHECK:    ret void
   ;
   entry:
@@ -493,7 +513,8 @@ define void @exp10_f64(double* nocapture %varray) {
 
 define void @exp10_f32(float* nocapture %varray) {
   ; CHECK-LABEL: @exp10_f32(
-  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_exp10f(<4 x float> [[TMP4:%.*]])
+  ; NEON:     [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_exp10f(<4 x float> [[TMP4:%.*]])
+  ; SVE:      [[TMP5:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_exp10f(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
   ; CHECK:    ret void
   ;
   entry:
@@ -521,7 +542,8 @@ declare float @llvm.lgamma.f32(float) #0
 
 define void @lgamma_f64(double* nocapture %varray) {
   ; CHECK-LABEL: @lgamma_f64(
-  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_lgamma(<2 x double> [[TMP4:%.*]])
+  ; NEON:     [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_lgamma(<2 x double> [[TMP4:%.*]])
+  ; SVE:      [[TMP5:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_lgamma(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
   ; CHECK:    ret void
   ;
   entry:
@@ -544,7 +566,8 @@ define void @lgamma_f64(double* nocapture %varray) {
 
 define void @lgamma_f32(float* nocapture %varray) {
   ; CHECK-LABEL: @lgamma_f32(
-  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_lgammaf(<4 x float> [[TMP4:%.*]])
+  ; NEON:     [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_lgammaf(<4 x float> [[TMP4:%.*]])
+  ; SVE:      [[TMP5:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_lgammaf(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
   ; CHECK:    ret void
   ;
   entry:
@@ -572,7 +595,8 @@ declare float @llvm.log10.f32(float) #0
 
 define void @log10_f64(double* nocapture %varray) {
   ; CHECK-LABEL: @log10_f64(
-  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_log10(<2 x double> [[TMP4:%.*]])
+  ; NEON:     [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_log10(<2 x double> [[TMP4:%.*]])
+  ; SVE:      [[TMP5:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log10(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
   ; CHECK:    ret void
   ;
   entry:
@@ -595,7 +619,8 @@ define void @log10_f64(double* nocapture %varray) {
 
 define void @log10_f32(float* nocapture %varray) {
   ; CHECK-LABEL: @log10_f32(
-  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_log10f(<4 x float> [[TMP4:%.*]])
+  ; NEON:     [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_log10f(<4 x float> [[TMP4:%.*]])
+  ; SVE:      [[TMP5:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_log10f(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
   ; CHECK:    ret void
   ;
   entry:
@@ -674,7 +699,8 @@ declare float @llvm.log.f32(float) #0
 
 define void @log_f64(double* nocapture %varray) {
   ; CHECK-LABEL: @log_f64(
-  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_log(<2 x double> [[TMP4:%.*]])
+  ; NEON:     [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_log(<2 x double> [[TMP4:%.*]])
+  ; SVE:      [[TMP5:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
   ; CHECK:    ret void
   ;
   entry:
@@ -697,7 +723,8 @@ define void @log_f64(double* nocapture %varray) {
 
 define void @log_f32(float* nocapture %varray) {
   ; CHECK-LABEL: @log_f32(
-  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_logf(<4 x float> [[TMP4:%.*]])
+  ; NEON:     [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_logf(<4 x float> [[TMP4:%.*]])
+  ; SVE:      [[TMP5:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_logf(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
   ; CHECK:    ret void
   ;
   entry:
@@ -725,7 +752,8 @@ declare float @llvm.pow.f32(float, float) #0
 
 define void @pow_f64(double* nocapture %varray) {
   ; CHECK-LABEL: @pow_f64(
-  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2vv_pow(<2 x double> [[TMP4:%.*]], <2 x double> [[TMP4:%.*]])
+  ; NEON:     [[TMP5:%.*]] = call <2 x double> @_ZGVnN2vv_pow(<2 x double> [[TMP4:%.*]], <2 x double> [[TMP4:%.*]])
+  ; SVE:      [[TMP5:%.*]] = call <vscale x 2 x double> @_ZGVsMxvv_pow(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
   ; CHECK:    ret void
   ;
   entry:
@@ -748,7 +776,8 @@ define void @pow_f64(double* nocapture %varray) {
 
 define void @pow_f32(float* nocapture %varray) {
   ; CHECK-LABEL: @pow_f32(
-  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4vv_powf(<4 x float> [[TMP4:%.*]], <4 x float> [[TMP4:%.*]])
+  ; NEON:     [[TMP5:%.*]] = call <4 x float> @_ZGVnN4vv_powf(<4 x float> [[TMP4:%.*]], <4 x float> [[TMP4:%.*]])
+  ; SVE:      [[TMP5:%.*]] = call <vscale x 4 x float> @_ZGVsMxvv_powf(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
   ; CHECK:    ret void
   ;
   entry:
@@ -776,7 +805,8 @@ declare float @llvm.sin.f32(float) #0
 
 define void @sin_f64(double* nocapture %varray) {
   ; CHECK-LABEL: @sin_f64(
-  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_sin(<2 x double> [[TMP4:%.*]])
+  ; NEON:     [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_sin(<2 x double> [[TMP4:%.*]])
+  ; SVE:      [[TMP5:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_sin(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
   ; CHECK:    ret void
   ;
   entry:
@@ -799,7 +829,8 @@ define void @sin_f64(double* nocapture %varray) {
 
 define void @sin_f32(float* nocapture %varray) {
   ; CHECK-LABEL: @sin_f32(
-  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_sinf(<4 x float> [[TMP4:%.*]])
+  ; NEON:     [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_sinf(<4 x float> [[TMP4:%.*]])
+  ; SVE:      [[TMP5:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_sinf(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
   ; CHECK:    ret void
   ;
   entry:
@@ -827,7 +858,8 @@ declare float @llvm.sinh.f32(float) #0
 
 define void @sinh_f64(double* nocapture %varray) {
   ; CHECK-LABEL: @sinh_f64(
-  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_sinh(<2 x double> [[TMP4:%.*]])
+  ; NEON:     [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_sinh(<2 x double> [[TMP4:%.*]])
+  ; SVE:      [[TMP5:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_sinh(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
   ; CHECK:    ret void
   ;
   entry:
@@ -850,7 +882,8 @@ define void @sinh_f64(double* nocapture %varray) {
 
 define void @sinh_f32(float* nocapture %varray) {
   ; CHECK-LABEL: @sinh_f32(
-  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_sinhf(<4 x float> [[TMP4:%.*]])
+  ; NEON:     [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_sinhf(<4 x float> [[TMP4:%.*]])
+  ; SVE:      [[TMP5:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_sinhf(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
   ; CHECK:    ret void
   ;
   entry:
@@ -878,7 +911,8 @@ declare float @llvm.sqrt.f32(float) #0
 
 define void @sqrt_f64(double* nocapture %varray) {
   ; CHECK-LABEL: @sqrt_f64(
-  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_sqrt(<2 x double> [[TMP4:%.*]])
+  ; NEON:     [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_sqrt(<2 x double> [[TMP4:%.*]])
+  ; SVE:      [[TMP5:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_sqrt(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
   ; CHECK:    ret void
   ;
   entry:
@@ -901,7 +935,8 @@ define void @sqrt_f64(double* nocapture %varray) {
 
 define void @sqrt_f32(float* nocapture %varray) {
   ; CHECK-LABEL: @sqrt_f32(
-  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_sqrtf(<4 x float> [[TMP4:%.*]])
+  ; NEON:     [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_sqrtf(<4 x float> [[TMP4:%.*]])
+  ; SVE:      [[TMP5:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_sqrtf(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
   ; CHECK:    ret void
   ;
   entry:
@@ -922,10 +957,10 @@ define void @sqrt_f32(float* nocapture %varray) {
   ret void
 }
 
-	
 define void @llvm_sqrt_f64(double* nocapture %varray) {
   ; CHECK-LABEL: @llvm_sqrt_f64(
-  ; CHECK:    [[TMP5:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP4:%.*]])
+  ; NEON:     [[TMP5:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP4:%.*]])
+  ; SVE:      [[TMP5:%.*]] = call fast <vscale x 2 x double> @llvm.sqrt.nxv2f64(<vscale x 2 x double> [[TMP4:%.*]])
   ; CHECK:    ret void
   ;
   entry:
@@ -948,7 +983,8 @@ define void @llvm_sqrt_f64(double* nocapture %varray) {
 
 define void @llvm_sqrt_f32(float* nocapture %varray) {
   ; CHECK-LABEL: @llvm_sqrt_f32(
-  ; CHECK:    [[TMP5:%.*]] = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP4:%.*]])
+  ; NEON:     [[TMP5:%.*]] = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP4:%.*]])
+  ; SVE:      [[TMP5:%.*]] = call fast <vscale x 4 x float> @llvm.sqrt.nxv4f32(<vscale x 4 x float> [[TMP4:%.*]])
   ; CHECK:    ret void
   ;
   entry:
@@ -976,7 +1012,8 @@ declare float @llvm.tan.f32(float) #0
 
 define void @tan_f64(double* nocapture %varray) {
   ; CHECK-LABEL: @tan_f64(
-  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_tan(<2 x double> [[TMP4:%.*]])
+  ; NEON:     [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_tan(<2 x double> [[TMP4:%.*]])
+  ; SVE:      [[TMP5:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_tan(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
   ; CHECK:    ret void
   ;
   entry:
@@ -999,7 +1036,8 @@ define void @tan_f64(double* nocapture %varray) {
 
 define void @tan_f32(float* nocapture %varray) {
   ; CHECK-LABEL: @tan_f32(
-  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_tanf(<4 x float> [[TMP4:%.*]])
+  ; NEON:     [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_tanf(<4 x float> [[TMP4:%.*]])
+  ; SVE:      [[TMP5:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_tanf(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
   ; CHECK:    ret void
   ;
   entry:
@@ -1027,7 +1065,8 @@ declare float @llvm.tanh.f32(float) #0
 
 define void @tanh_f64(double* nocapture %varray) {
   ; CHECK-LABEL: @tanh_f64(
-  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_tanh(<2 x double> [[TMP4:%.*]])
+  ; NEON:     [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_tanh(<2 x double> [[TMP4:%.*]])
+  ; SVE:      [[TMP5:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_tanh(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
   ; CHECK:    ret void
   ;
   entry:
@@ -1050,7 +1089,8 @@ define void @tanh_f64(double* nocapture %varray) {
 
 define void @tanh_f32(float* nocapture %varray) {
   ; CHECK-LABEL: @tanh_f32(
-  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_tanhf(<4 x float> [[TMP4:%.*]])
+  ; NEON:     [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_tanhf(<4 x float> [[TMP4:%.*]])
+  ; SVE:      [[TMP5:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_tanhf(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
   ; CHECK:    ret void
   ;
   entry:
@@ -1078,7 +1118,8 @@ declare float @llvm.tgamma.f32(float) #0
 
 define void @tgamma_f64(double* nocapture %varray) {
   ; CHECK-LABEL: @tgamma_f64(
-  ; CHECK:    [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_tgamma(<2 x double> [[TMP4:%.*]])
+  ; NEON:     [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_tgamma(<2 x double> [[TMP4:%.*]])
+  ; SVE:      [[TMP5:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_tgamma(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
   ; CHECK:    ret void
   ;
   entry:
@@ -1101,7 +1142,8 @@ define void @tgamma_f64(double* nocapture %varray) {
 
 define void @tgamma_f32(float* nocapture %varray) {
   ; CHECK-LABEL: @tgamma_f32(
-  ; CHECK:    [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_tgammaf(<4 x float> [[TMP4:%.*]])
+  ; NEON:     [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_tgammaf(<4 x float> [[TMP4:%.*]])
+  ; SVE:      [[TMP5:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_tgammaf(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
   ; CHECK:    ret void
   ;
   entry:
