[llvm] [TLI] Pass replace-with-veclib works with Scalable Vectors. (PR #73642)

Paschalis Mpeis via llvm-commits llvm-commits at lists.llvm.org
Thu Dec 14 01:06:15 PST 2023


https://github.com/paschalis-mpeis updated https://github.com/llvm/llvm-project/pull/73642

From 73d59dd54cc972bab2fac190973da010289c25ea Mon Sep 17 00:00:00 2001
From: Paschalis Mpeis <Paschalis.Mpeis at arm.com>
Date: Fri, 1 Dec 2023 17:56:59 +0000
Subject: [PATCH 01/10] [VFABI] Create FunctionType for vector functions

`createFunctionType` optionally returns a FunctionType along with the
mask's position, when one exists. It requires a VFInfo and an
Instruction.

Add `checkFunctionType` to the 'VectorFunctionABITest.cpp' tests to
check that both the number and the types of the vectorized parameters
match the created `FunctionType`.
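
A minimal caller sketch of the new interface (a hypothetical snippet;
`Info`, `CI`, and `M` stand for a demangled VFInfo, an already
vectorized call, and its parent Module, with
"llvm/Analysis/VectorUtils.h" included):

  std::optional<std::pair<FunctionType *, int>> Res =
      VFABI::createFunctionType(Info, CI, M);
  if (Res) {
    FunctionType *VecFTy = Res->first; // the vectorized signature
    int MaskPos = Res->second;         // mask position, or -1 when unmasked
  }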
---
 llvm/include/llvm/Analysis/VectorUtils.h      |   7 +
 llvm/lib/Analysis/VFABIDemangling.cpp         |   2 +-
 llvm/lib/Analysis/VectorUtils.cpp             |  45 +++++
 .../Analysis/VectorFunctionABITest.cpp        | 157 ++++++++++++++----
 4 files changed, 181 insertions(+), 30 deletions(-)

diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h
index 55a6aa645a86e2..734c440283b4a0 100644
--- a/llvm/include/llvm/Analysis/VectorUtils.h
+++ b/llvm/include/llvm/Analysis/VectorUtils.h
@@ -195,6 +195,13 @@ static constexpr char const *MappingsAttrName = "vector-function-abi-variant";
 /// the presence of the attribute (see InjectTLIMappings).
 void getVectorVariantNames(const CallInst &CI,
                            SmallVectorImpl<std::string> &VariantMappings);
+
+/// Returns a pair of the vectorized FunctionType and the mask's position when
+/// there's one, otherwise -1. It rejects any non-vectorized calls as this
+/// method should be called at a point where the Instruction \p I is already
+/// vectorized.
+std::optional<std::pair<FunctionType *, int>>
+createFunctionType(const VFInfo &Info, const Instruction *I, const Module *M);
 } // end namespace VFABI
 
 /// The Vector Function Database.
diff --git a/llvm/lib/Analysis/VFABIDemangling.cpp b/llvm/lib/Analysis/VFABIDemangling.cpp
index 92af314a41caad..fc94a33851963c 100644
--- a/llvm/lib/Analysis/VFABIDemangling.cpp
+++ b/llvm/lib/Analysis/VFABIDemangling.cpp
@@ -376,7 +376,7 @@ std::optional<VFInfo> VFABI::tryDemangleForVFABI(StringRef MangledName,
   // _ZGV<isa><mask><vlen><parameters>_<scalarname>.
   StringRef VectorName = MangledName;
 
-  // Parse the fixed size part of the manled name
+  // Parse the fixed size part of the mangled name
   if (!MangledName.consume_front("_ZGV"))
     return std::nullopt;
 
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 91d8c31fa062de..c31f0f3bd2fd58 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -12,6 +12,7 @@
 
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/DemandedBits.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopIterator.h"
@@ -24,6 +25,7 @@
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/Value.h"
 #include "llvm/Support/CommandLine.h"
+#include <optional>
 
 #define DEBUG_TYPE "vectorutils"
 
@@ -1477,6 +1479,49 @@ void VFABI::getVectorVariantNames(
   }
 }
 
+// Returns whether any of the operands or return type of \p I are vectors.
+static bool isVectorized(const Instruction *I) {
+  if (I->getType()->isVectorTy())
+    return true;
+  for (auto &U : I->operands())
+    if (U->getType()->isVectorTy())
+      return true;
+  return false;
+}
+
+std::optional<std::pair<FunctionType *, int>>
+VFABI::createFunctionType(const VFInfo &Info, const Instruction *I,
+                          const Module *M) {
+  // only vectorized calls should reach this method
+  if (!isVectorized(I))
+    return std::nullopt;
+
+  ElementCount VF = Info.Shape.VF;
+  // get vectorized operands
+  const bool IsCall = isa<CallBase>(I);
+  SmallVector<Type *, 8> VecParams;
+  for (auto [i, U] : enumerate(I->operands())) {
+    // ignore the function pointer when the Instruction is a call
+    if (IsCall && i == I->getNumOperands() - 1)
+      break;
+    VecParams.push_back(U->getType());
+  }
+
+  // Append a mask and get its position.
+  int MaskPos = -1;
+  if (Info.isMasked()) {
+    auto OptMaskPos = Info.getParamIndexForOptionalMask();
+    if (!OptMaskPos)
+      return std::nullopt;
+
+    MaskPos = OptMaskPos.value();
+    VectorType *MaskTy = VectorType::get(Type::getInt1Ty(M->getContext()), VF);
+    VecParams.insert(VecParams.begin() + MaskPos, MaskTy);
+  }
+  FunctionType *VecFTy = FunctionType::get(I->getType(), VecParams, false);
+  return std::make_pair(VecFTy, MaskPos);
+}
+
 bool VFShape::hasValidParameterList() const {
   for (unsigned Pos = 0, NumParams = Parameters.size(); Pos < NumParams;
        ++Pos) {
diff --git a/llvm/unittests/Analysis/VectorFunctionABITest.cpp b/llvm/unittests/Analysis/VectorFunctionABITest.cpp
index 201dd1127ef234..85177db74ef585 100644
--- a/llvm/unittests/Analysis/VectorFunctionABITest.cpp
+++ b/llvm/unittests/Analysis/VectorFunctionABITest.cpp
@@ -11,6 +11,7 @@
 #include "llvm/AsmParser/Parser.h"
 #include "llvm/IR/InstIterator.h"
 #include "gtest/gtest.h"
+#include <optional>
 
 using namespace llvm;
 
@@ -91,6 +92,77 @@ class VFABIParserTest : public ::testing::Test {
   bool matchParametersNum() {
     return (Parameters.size() - isMasked()) == ScalarFTy->getNumParams();
   }
+
+  /// Creates a mock CallInst and uses it along with VFInfo to create a
+  /// FunctionType. Then it checks that the created FunctionType matches the
+  /// number and type of arguments with both the ScalarFTy and the operands of
+  /// the call.
+  bool checkFunctionType() {
+    // Create a mock vectorized CallInst using dummy values and then use it to
+    // create a vector FunctionType. In the case of scalable ISAs, the created
+    // vector FunctionType might have a mask parameter Type, however, this input
+    // CallInst will not have a mask operand.
+    SmallVector<Value *, 8> Args;
+    SmallVector<Type *, 8> CallTypes;
+    for (auto [VFParam, STy] :
+         zip(Info.Shape.Parameters, ScalarFTy->params())) {
+      // use VectorType where relevant, according to VFShape
+      Type *UseTy = STy;
+      if (VFParam.ParamKind == VFParamKind::Vector)
+        UseTy = VectorType::get(STy, Info.Shape.VF);
+
+      CallTypes.push_back(UseTy);
+      Args.push_back(Constant::getNullValue(UseTy));
+    }
+
+    // Mangled names do not currently encode return Type information. Generally,
+    // return types are vectors, so use one.
+    Type *RetTy = ScalarFTy->getReturnType();
+    if (!RetTy->isVoidTy())
+      RetTy = VectorType::get(RetTy, Info.Shape.VF);
+
+    FunctionCallee F = M->getOrInsertFunction(
+        VectorName, FunctionType::get(RetTy, CallTypes, false));
+    std::unique_ptr<CallInst> CI(CallInst::Create(F, Args));
+
+    // Use VFInfo and the mock CallInst to create a FunctionType that will
+    // include a mask where relevant.
+    auto OptVecFTyPos = VFABI::createFunctionType(Info, CI.get(), M.get());
+    if (!OptVecFTyPos)
+      return false;
+
+    FunctionType *VecFTy = OptVecFTyPos->first;
+    // Check that the number of vectorized parameters matches VFInfo.
+    // Both may include a mask.
+    if ((VecFTy->getNumParams() != Info.Shape.Parameters.size()))
+      return false;
+
+    // Check if the types of the vectorized parameters from the created
+    // FunctionType match with the arguments passed to the CallInst. Any masks
+    // are ignored, as the original, mock CallInst does not have one.
+    auto VecParams = VecFTy->params();
+    for (auto [VecTy, VFTyParam] : zip(CallTypes, VecParams))
+      if (VecTy != VFTyParam)
+        return false;
+
+    // Check if the types of the scalar and vector FunctionTypes match.
+    // In the case of a mask, the vector FunctionType should have an additional
+    // i1 vector parameter.
+    if (ScalarFTy->getReturnType() != VecFTy->getReturnType()->getScalarType())
+      return false;
+    auto ScalarParams = ScalarFTy->params();
+    for (auto [OptSTy, OptVTy] : zip_longest(ScalarParams, VecParams)) {
+      Type *VTy = OptVTy.value();
+      // ensure the extra vector Type is a mask
+      if (!OptSTy && VTy->isVectorTy() &&
+          VTy->getScalarType() != Type::getInt1Ty(M->getContext()))
+        return false;
+      if (OptSTy && OptSTy.value() != VTy->getScalarType())
+        return false;
+    }
+
+    return true;
+  }
 };
 } // unnamed namespace
 
@@ -130,7 +202,8 @@ TEST_F(VFABIParserTest, ParamListParsing) {
       invokeParser("_ZGVnN2vl16Ls32R3l_foo", "void(i32, i32, i32, ptr, i32)"));
   EXPECT_EQ(ISA, VFISAKind::AdvancedSIMD);
   EXPECT_EQ(false, isMasked());
-  EXPECT_TRUE(matchParametersNum()) << "Different number of scalar parameters";
+  EXPECT_TRUE(matchParametersNum());
+  EXPECT_TRUE(checkFunctionType());
   EXPECT_EQ(Parameters.size(), (unsigned)5);
   EXPECT_EQ(Parameters[0], VFParameter({0, VFParamKind::Vector, 0}));
   EXPECT_EQ(Parameters[1], VFParameter({1, VFParamKind::OMP_Linear, 16}));
@@ -145,7 +218,8 @@ TEST_F(VFABIParserTest, ScalarNameAndVectorName_01) {
   EXPECT_TRUE(invokeParser("_ZGVnM2v_foo(vector_foo)", "void(i32)"));
   EXPECT_EQ(ISA, VFISAKind::AdvancedSIMD);
   EXPECT_EQ(true, isMasked());
-  EXPECT_TRUE(matchParametersNum()) << "Different number of scalar parameters";
+  EXPECT_TRUE(matchParametersNum());
+  EXPECT_TRUE(checkFunctionType());
   EXPECT_EQ(ScalarName, "foo");
   EXPECT_EQ(VectorName, "vector_foo");
 }
@@ -154,7 +228,8 @@ TEST_F(VFABIParserTest, ScalarNameAndVectorName_02) {
   EXPECT_TRUE(invokeParser("_ZGVnM2v_foo(vector_foo)", "void(i32)"));
   EXPECT_EQ(ISA, VFISAKind::AdvancedSIMD);
   EXPECT_EQ(true, isMasked());
-  EXPECT_TRUE(matchParametersNum()) << "Different number of scalar parameters";
+  EXPECT_TRUE(matchParametersNum());
+  EXPECT_TRUE(checkFunctionType());
   EXPECT_EQ(ScalarName, "foo");
   EXPECT_EQ(VectorName, "vector_foo");
 }
@@ -164,7 +239,8 @@ TEST_F(VFABIParserTest, ScalarNameAndVectorName_03) {
       invokeParser("_ZGVnM2v___foo_bar_abc(fooBarAbcVec)", "void(i32)"));
   EXPECT_EQ(ISA, VFISAKind::AdvancedSIMD);
   EXPECT_EQ(true, isMasked());
-  EXPECT_TRUE(matchParametersNum()) << "Different number of scalar parameters";
+  EXPECT_TRUE(matchParametersNum());
+  EXPECT_TRUE(checkFunctionType());
   EXPECT_EQ(ScalarName, "__foo_bar_abc");
   EXPECT_EQ(VectorName, "fooBarAbcVec");
 }
@@ -185,7 +261,8 @@ TEST_F(VFABIParserTest, Parse) {
                    "void(i32, i32, i32, i32, ptr, i32, i32, i32, ptr)"));
   EXPECT_EQ(ISA, VFISAKind::AdvancedSIMD);
   EXPECT_FALSE(isMasked());
-  EXPECT_TRUE(matchParametersNum()) << "Different number of scalar parameters";
+  EXPECT_TRUE(matchParametersNum());
+  EXPECT_TRUE(checkFunctionType());
   EXPECT_EQ(VF, ElementCount::getFixed(2));
   EXPECT_EQ(Parameters.size(), (unsigned)9);
   EXPECT_EQ(Parameters[0], VFParameter({0, VFParamKind::Vector, 0}));
@@ -205,7 +282,8 @@ TEST_F(VFABIParserTest, ParseVectorName) {
   EXPECT_TRUE(invokeParser("_ZGVnN2v_foo(vector_foo)", "void(i32)"));
   EXPECT_EQ(ISA, VFISAKind::AdvancedSIMD);
   EXPECT_FALSE(isMasked());
-  EXPECT_TRUE(matchParametersNum()) << "Different number of scalar parameters";
+  EXPECT_TRUE(matchParametersNum());
+  EXPECT_TRUE(checkFunctionType());
   EXPECT_EQ(VF, ElementCount::getFixed(2));
   EXPECT_EQ(Parameters.size(), (unsigned)1);
   EXPECT_EQ(Parameters[0], VFParameter({0, VFParamKind::Vector, 0}));
@@ -218,7 +296,8 @@ TEST_F(VFABIParserTest, LinearWithCompileTimeNegativeStep) {
                            "void(i32, i32, i32, ptr)"));
   EXPECT_EQ(ISA, VFISAKind::AdvancedSIMD);
   EXPECT_FALSE(isMasked());
-  EXPECT_TRUE(matchParametersNum()) << "Different number of scalar parameters";
+  EXPECT_TRUE(matchParametersNum());
+  EXPECT_FALSE(checkFunctionType()); // invalid: all operands are scalar
   EXPECT_EQ(VF, ElementCount::getFixed(2));
   EXPECT_EQ(Parameters.size(), (unsigned)4);
   EXPECT_EQ(Parameters[0], VFParameter({0, VFParamKind::OMP_Linear, -1}));
@@ -233,7 +312,8 @@ TEST_F(VFABIParserTest, ParseScalableSVE) {
   EXPECT_TRUE(invokeParser("_ZGVsMxv_foo(vector_foo)", "void(i32)"));
   EXPECT_EQ(ISA, VFISAKind::SVE);
   EXPECT_TRUE(isMasked());
-  EXPECT_TRUE(matchParametersNum()) << "Different number of scalar parameters";
+  EXPECT_TRUE(matchParametersNum());
+  EXPECT_TRUE(checkFunctionType());
   EXPECT_EQ(VF, ElementCount::getScalable(4));
   EXPECT_EQ(Parameters.size(), (unsigned)2);
   EXPECT_EQ(Parameters[0], VFParameter({0, VFParamKind::Vector}));
@@ -246,7 +326,8 @@ TEST_F(VFABIParserTest, ParseFixedWidthSVE) {
   EXPECT_TRUE(invokeParser("_ZGVsM2v_foo(vector_foo)", "void(i32)"));
   EXPECT_EQ(ISA, VFISAKind::SVE);
   EXPECT_TRUE(isMasked());
-  EXPECT_TRUE(matchParametersNum()) << "Different number of scalar parameters";
+  EXPECT_TRUE(matchParametersNum());
+  EXPECT_TRUE(checkFunctionType());
   EXPECT_EQ(VF, ElementCount::getFixed(2));
   EXPECT_EQ(Parameters.size(), (unsigned)2);
   EXPECT_EQ(Parameters[0], VFParameter({0, VFParamKind::Vector}));
@@ -280,7 +361,8 @@ TEST_F(VFABIParserTest, LinearWithoutCompileTime) {
                            "void(i32, i32, ptr, i32, i32, i32, ptr, i32)"));
   EXPECT_EQ(ISA, VFISAKind::AdvancedSIMD);
   EXPECT_FALSE(isMasked());
-  EXPECT_TRUE(matchParametersNum()) << "Different number of scalar parameters";
+  EXPECT_TRUE(matchParametersNum());
+  EXPECT_FALSE(checkFunctionType()); // invalid: all operands are scalar
   EXPECT_EQ(Parameters.size(), (unsigned)8);
   EXPECT_EQ(Parameters[0], VFParameter({0, VFParamKind::OMP_Linear, 1}));
   EXPECT_EQ(Parameters[1], VFParameter({1, VFParamKind::OMP_LinearVal, 1}));
@@ -299,7 +381,8 @@ TEST_F(VFABIParserTest, LLVM_ISA) {
   EXPECT_TRUE(invokeParser("_ZGV_LLVM_N2v_foo(vector_foo)", "void(i32)"));
   EXPECT_EQ(ISA, VFISAKind::LLVM);
   EXPECT_FALSE(isMasked());
-  EXPECT_TRUE(matchParametersNum()) << "Different number of scalar parameters";
+  EXPECT_TRUE(matchParametersNum());
+  EXPECT_TRUE(checkFunctionType());
   EXPECT_EQ(Parameters.size(), (unsigned)1);
   EXPECT_EQ(Parameters[0], VFParameter({0, VFParamKind::Vector}));
   EXPECT_EQ(ScalarName, "foo");
@@ -318,7 +401,7 @@ TEST_F(VFABIParserTest, Align) {
   EXPECT_TRUE(invokeParser("_ZGVsN2l2a2_foo(vector_foo)", "void(i32)"));
   EXPECT_EQ(ISA, VFISAKind::SVE);
   EXPECT_FALSE(isMasked());
-  EXPECT_TRUE(matchParametersNum()) << "Different number of scalar parameters";
+  EXPECT_TRUE(matchParametersNum());
   EXPECT_EQ(Parameters.size(), (unsigned)1);
   EXPECT_EQ(Parameters[0].Alignment, Align(2));
   EXPECT_EQ(ScalarName, "foo");
@@ -341,7 +424,8 @@ TEST_F(VFABIParserTest, ParseUniform) {
   EXPECT_TRUE(invokeParser("_ZGVnN2u_foo(vector_foo)", "void(i32)"));
   EXPECT_EQ(ISA, VFISAKind::AdvancedSIMD);
   EXPECT_FALSE(isMasked());
-  EXPECT_TRUE(matchParametersNum()) << "Different number of scalar parameters";
+  EXPECT_TRUE(matchParametersNum());
+  EXPECT_FALSE(checkFunctionType()); // invalid: all operands are scalar
   EXPECT_EQ(VF, ElementCount::getFixed(2));
   EXPECT_EQ(Parameters.size(), (unsigned)1);
   EXPECT_EQ(Parameters[0], VFParameter({0, VFParamKind::OMP_Uniform, 0}));
@@ -374,8 +458,8 @@ TEST_F(VFABIParserTest, ISAIndependentMangling) {
   do {                                                                         \
     EXPECT_EQ(VF, ElementCount::getFixed(2));                                  \
     EXPECT_FALSE(isMasked());                                                  \
-    EXPECT_TRUE(matchParametersNum())                                          \
-        << "Different number of scalar parameters";                            \
+    EXPECT_TRUE(matchParametersNum());                                         \
+    EXPECT_TRUE(checkFunctionType());                                          \
     EXPECT_EQ(Parameters.size(), (unsigned)10);                                \
     EXPECT_EQ(Parameters, ExpectedParams);                                     \
     EXPECT_EQ(ScalarName, "foo");                                              \
@@ -450,7 +534,8 @@ TEST_F(VFABIParserTest, ParseMaskingNEON) {
   EXPECT_TRUE(invokeParser("_ZGVnM2v_foo(vector_foo)", "void(i32)"));
   EXPECT_EQ(ISA, VFISAKind::AdvancedSIMD);
   EXPECT_TRUE(isMasked());
-  EXPECT_TRUE(matchParametersNum()) << "Different number of scalar parameters";
+  EXPECT_TRUE(matchParametersNum());
+  EXPECT_TRUE(checkFunctionType());
   EXPECT_EQ(VF, ElementCount::getFixed(2));
   EXPECT_EQ(Parameters.size(), (unsigned)2);
   EXPECT_EQ(Parameters[0], VFParameter({0, VFParamKind::Vector}));
@@ -463,7 +548,8 @@ TEST_F(VFABIParserTest, ParseMaskingSVE) {
   EXPECT_TRUE(invokeParser("_ZGVsM2v_foo(vector_foo)", "void(i32)"));
   EXPECT_EQ(ISA, VFISAKind::SVE);
   EXPECT_TRUE(isMasked());
-  EXPECT_TRUE(matchParametersNum()) << "Different number of scalar parameters";
+  EXPECT_TRUE(matchParametersNum());
+  EXPECT_TRUE(checkFunctionType());
   EXPECT_EQ(VF, ElementCount::getFixed(2));
   EXPECT_EQ(Parameters.size(), (unsigned)2);
   EXPECT_EQ(Parameters[0], VFParameter({0, VFParamKind::Vector}));
@@ -476,7 +562,8 @@ TEST_F(VFABIParserTest, ParseMaskingSSE) {
   EXPECT_TRUE(invokeParser("_ZGVbM2v_foo(vector_foo)", "void(i32)"));
   EXPECT_EQ(ISA, VFISAKind::SSE);
   EXPECT_TRUE(isMasked());
-  EXPECT_TRUE(matchParametersNum()) << "Different number of scalar parameters";
+  EXPECT_TRUE(matchParametersNum());
+  EXPECT_TRUE(checkFunctionType());
   EXPECT_EQ(VF, ElementCount::getFixed(2));
   EXPECT_EQ(Parameters.size(), (unsigned)2);
   EXPECT_EQ(Parameters[0], VFParameter({0, VFParamKind::Vector}));
@@ -489,7 +576,8 @@ TEST_F(VFABIParserTest, ParseMaskingAVX) {
   EXPECT_TRUE(invokeParser("_ZGVcM2v_foo(vector_foo)", "void(i32)"));
   EXPECT_EQ(ISA, VFISAKind::AVX);
   EXPECT_TRUE(isMasked());
-  EXPECT_TRUE(matchParametersNum()) << "Different number of scalar parameters";
+  EXPECT_TRUE(matchParametersNum());
+  EXPECT_TRUE(checkFunctionType());
   EXPECT_EQ(VF, ElementCount::getFixed(2));
   EXPECT_EQ(Parameters.size(), (unsigned)2);
   EXPECT_EQ(Parameters[0], VFParameter({0, VFParamKind::Vector}));
@@ -502,7 +590,8 @@ TEST_F(VFABIParserTest, ParseMaskingAVX2) {
   EXPECT_TRUE(invokeParser("_ZGVdM2v_foo(vector_foo)", "void(i32)"));
   EXPECT_EQ(ISA, VFISAKind::AVX2);
   EXPECT_TRUE(isMasked());
-  EXPECT_TRUE(matchParametersNum()) << "Different number of scalar parameters";
+  EXPECT_TRUE(matchParametersNum());
+  EXPECT_TRUE(checkFunctionType());
   EXPECT_EQ(VF, ElementCount::getFixed(2));
   EXPECT_EQ(Parameters.size(), (unsigned)2);
   EXPECT_EQ(Parameters[0], VFParameter({0, VFParamKind::Vector}));
@@ -515,7 +604,8 @@ TEST_F(VFABIParserTest, ParseMaskingAVX512) {
   EXPECT_TRUE(invokeParser("_ZGVeM2v_foo(vector_foo)", "void(i32)"));
   EXPECT_EQ(ISA, VFISAKind::AVX512);
   EXPECT_TRUE(isMasked());
-  EXPECT_TRUE(matchParametersNum()) << "Different number of scalar parameters";
+  EXPECT_TRUE(matchParametersNum());
+  EXPECT_TRUE(checkFunctionType());
   EXPECT_EQ(VF, ElementCount::getFixed(2));
   EXPECT_EQ(Parameters.size(), (unsigned)2);
   EXPECT_EQ(Parameters[0], VFParameter({0, VFParamKind::Vector}));
@@ -528,7 +618,8 @@ TEST_F(VFABIParserTest, ParseMaskingLLVM) {
   EXPECT_TRUE(invokeParser("_ZGV_LLVM_M2v_foo(vector_foo)", "void(i32)"));
   EXPECT_EQ(ISA, VFISAKind::LLVM);
   EXPECT_TRUE(isMasked());
-  EXPECT_TRUE(matchParametersNum()) << "Different number of scalar parameters";
+  EXPECT_TRUE(matchParametersNum());
+  EXPECT_TRUE(checkFunctionType());
   EXPECT_EQ(VF, ElementCount::getFixed(2));
   EXPECT_EQ(Parameters.size(), (unsigned)2);
   EXPECT_EQ(Parameters[0], VFParameter({0, VFParamKind::Vector}));
@@ -546,7 +637,8 @@ TEST_F(VFABIParserTest, LLVM_InternalISA) {
   EXPECT_TRUE(invokeParser("_ZGV_LLVM_N2v_foo(vector_foo)", "void(i32)"));
   EXPECT_EQ(ISA, VFISAKind::LLVM);
   EXPECT_FALSE(isMasked());
-  EXPECT_TRUE(matchParametersNum()) << "Different number of scalar parameters";
+  EXPECT_TRUE(matchParametersNum());
+  EXPECT_TRUE(checkFunctionType());
   EXPECT_EQ(Parameters.size(), (unsigned)1);
   EXPECT_EQ(Parameters[0], VFParameter({0, VFParamKind::Vector}));
   EXPECT_EQ(ScalarName, "foo");
@@ -558,7 +650,8 @@ TEST_F(VFABIParserTest, IntrinsicsInLLVMIsa) {
                            "void(float, float)"));
   EXPECT_EQ(ISA, VFISAKind::LLVM);
   EXPECT_FALSE(isMasked());
-  EXPECT_TRUE(matchParametersNum()) << "Different number of scalar parameters";
+  EXPECT_TRUE(matchParametersNum());
+  EXPECT_TRUE(checkFunctionType());
   EXPECT_EQ(VF, ElementCount::getFixed(4));
   EXPECT_EQ(Parameters.size(), (unsigned)2);
   EXPECT_EQ(Parameters[0], VFParameter({0, VFParamKind::Vector}));
@@ -573,7 +666,8 @@ TEST_F(VFABIParserTest, ParseScalableRequiresDeclaration) {
   EXPECT_TRUE(invokeParser(MangledName, "void(i32)"));
   EXPECT_EQ(ISA, VFISAKind::SVE);
   EXPECT_TRUE(isMasked());
-  EXPECT_TRUE(matchParametersNum()) << "Different number of scalar parameters";
+  EXPECT_TRUE(matchParametersNum());
+  EXPECT_TRUE(checkFunctionType());
   EXPECT_EQ(Parameters.size(), (unsigned)2);
   EXPECT_EQ(Parameters[0], VFParameter({0, VFParamKind::Vector}));
   EXPECT_EQ(Parameters[1], VFParameter({1, VFParamKind::GlobalPredicate}));
@@ -592,7 +686,8 @@ TEST_F(VFABIParserTest, ParseScalableMaskingSVE) {
   EXPECT_TRUE(invokeParser("_ZGVsMxv_foo(vector_foo)", "i32(i32)"));
   EXPECT_EQ(ISA, VFISAKind::SVE);
   EXPECT_TRUE(isMasked());
-  EXPECT_TRUE(matchParametersNum()) << "Different number of scalar parameters";
+  EXPECT_TRUE(matchParametersNum());
+  EXPECT_TRUE(checkFunctionType());
   EXPECT_EQ(VF, ElementCount::getScalable(4));
   EXPECT_EQ(Parameters.size(), (unsigned)2);
   EXPECT_EQ(Parameters[0], VFParameter({0, VFParamKind::Vector}));
@@ -606,7 +701,8 @@ TEST_F(VFABIParserTest, ParseScalableMaskingSVESincos) {
                            "void(double, ptr, ptr)"));
   EXPECT_EQ(ISA, VFISAKind::SVE);
   EXPECT_TRUE(isMasked());
-  EXPECT_TRUE(matchParametersNum()) << "Different number of scalar parameters";
+  EXPECT_TRUE(matchParametersNum());
+  EXPECT_TRUE(checkFunctionType());
   EXPECT_EQ(VF, ElementCount::getScalable(2));
   EXPECT_EQ(Parameters.size(), (unsigned)4);
   EXPECT_EQ(Parameters[0], VFParameter({0, VFParamKind::Vector}));
@@ -623,7 +719,8 @@ TEST_F(VFABIParserTest, ParseWiderReturnTypeSVE) {
   EXPECT_TRUE(invokeParser("_ZGVsMxvv_foo(vector_foo)", "i64(i32, i32)"));
   EXPECT_EQ(ISA, VFISAKind::SVE);
   EXPECT_TRUE(isMasked());
-  EXPECT_TRUE(matchParametersNum()) << "Different number of scalar parameters";
+  EXPECT_TRUE(matchParametersNum());
+  EXPECT_TRUE(checkFunctionType());
   EXPECT_EQ(Parameters.size(), (unsigned)3);
   EXPECT_EQ(Parameters[0], VFParameter({0, VFParamKind::Vector}));
   EXPECT_EQ(Parameters[1], VFParameter({1, VFParamKind::Vector}));
@@ -638,7 +735,8 @@ TEST_F(VFABIParserTest, ParseVoidReturnTypeSVE) {
   EXPECT_TRUE(invokeParser("_ZGVsMxv_foo(vector_foo)", "void(i16)"));
   EXPECT_EQ(ISA, VFISAKind::SVE);
   EXPECT_TRUE(isMasked());
-  EXPECT_TRUE(matchParametersNum()) << "Different number of scalar parameters";
+  EXPECT_TRUE(matchParametersNum());
+  EXPECT_TRUE(checkFunctionType());
   EXPECT_EQ(Parameters.size(), (unsigned)2);
   EXPECT_EQ(Parameters[0], VFParameter({0, VFParamKind::Vector}));
   EXPECT_EQ(Parameters[1], VFParameter({1, VFParamKind::GlobalPredicate}));
@@ -656,6 +754,7 @@ TEST_F(VFABIParserTest, ParseUnsupportedElementTypeSVE) {
 TEST_F(VFABIParserTest, ParseUnsupportedReturnTypeSVE) {
   EXPECT_FALSE(invokeParser("_ZGVsMxv_foo(vector_foo)", "fp128(float)"));
 }
+
 class VFABIAttrTest : public testing::Test {
 protected:
   void SetUp() override {

From c95fe95111af988881865a327fd5641d9ca6239d Mon Sep 17 00:00:00 2001
From: Paschalis Mpeis <Paschalis.Mpeis at arm.com>
Date: Mon, 11 Dec 2023 18:02:37 +0000
Subject: [PATCH 02/10] Handle already masked Instructions.

'createFunctionType' should be able to create the correct FunctionType,
regardless of whether the input Instruction was masked or not.
It uses VFInfo to figure out if the input Instruction was already
masked, and if so it does not append another mask Type.

In the checks, create two mock CallInsts, one with a mask and one
without, and verify that they have the same number of parameters.
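
A condensed sketch of that invariant (hypothetical `CI`/`CIMasked` are
the unmasked and masked mock calls, using the signature from this
patch):

  auto Unmasked = VFABI::createFunctionType(Info, CI, M);
  auto Masked = VFABI::createFunctionType(Info, CIMasked, M);
  assert(Unmasked && Masked &&
         Unmasked->first->getNumParams() == Masked->first->getNumParams() &&
         "createFunctionType must append the mask exactly once");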
---
 llvm/lib/Analysis/VectorUtils.cpp             | 10 ++++---
 .../Analysis/VectorFunctionABITest.cpp        | 27 +++++++++++++++++++
 2 files changed, 34 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index c31f0f3bd2fd58..f4dc26da812250 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -1507,7 +1507,7 @@ VFABI::createFunctionType(const VFInfo &Info, const Instruction *I,
     VecParams.push_back(U->getType());
   }
 
-  // Append a mask and get its position.
+  // Get the mask's position and append a mask if not present in the Instruction.
   int MaskPos = -1;
   if (Info.isMasked()) {
     auto OptMaskPos = Info.getParamIndexForOptionalMask();
@@ -1515,8 +1515,12 @@ VFABI::createFunctionType(const VFInfo &Info, const Instruction *I,
       return std::nullopt;
 
     MaskPos = OptMaskPos.value();
-    VectorType *MaskTy = VectorType::get(Type::getInt1Ty(M->getContext()), VF);
-    VecParams.insert(VecParams.begin() + MaskPos, MaskTy);
+    // append a mask only when it's missing
+    if (VecParams.size() == Info.Shape.Parameters.size() - 1) {
+      VectorType *MaskTy =
+          VectorType::get(Type::getInt1Ty(M->getContext()), VF);
+      VecParams.insert(VecParams.begin() + MaskPos, MaskTy);
+    }
   }
   FunctionType *VecFTy = FunctionType::get(I->getType(), VecParams, false);
   return std::make_pair(VecFTy, MaskPos);
diff --git a/llvm/unittests/Analysis/VectorFunctionABITest.cpp b/llvm/unittests/Analysis/VectorFunctionABITest.cpp
index 85177db74ef585..8daebb6042156d 100644
--- a/llvm/unittests/Analysis/VectorFunctionABITest.cpp
+++ b/llvm/unittests/Analysis/VectorFunctionABITest.cpp
@@ -131,6 +131,33 @@ class VFABIParserTest : public ::testing::Test {
     if (!OptVecFTyPos)
       return false;
 
+    // Ensure that masked Instructions are handled
+    if (isMasked()) {
+      // In case of a masked call, try creating another mock CallInst that is
+      // masked. createFunctionType should be able to handle this.
+      SmallVector<Type *, 8> CallTypesInclMask(CallTypes);
+      SmallVector<Value *, 8> ArgsInclMask(Args);
+      Type *MaskTy = VectorType::get(Type::getInt1Ty(M->getContext()), VF);
+      CallTypesInclMask.push_back(MaskTy);
+      ArgsInclMask.push_back(Constant::getNullValue(MaskTy));
+
+      FunctionCallee FMasked = M->getOrInsertFunction(
+          VectorName + "_Masked",
+          FunctionType::get(RetTy, CallTypesInclMask, false));
+      std::unique_ptr<CallInst> CIMasked(
+          CallInst::Create(FMasked, ArgsInclMask));
+      auto OptVecFTyMaskedPos =
+          VFABI::createFunctionType(Info, CIMasked.get(), M.get());
+      if (!OptVecFTyMaskedPos)
+        return false;
+
+      // Both FunctionTypes should have the same number of parameters.
+      assert(
+          (OptVecFTyPos->first->getNumParams() ==
+           OptVecFTyMaskedPos->first->getNumParams()) &&
+          "createFunctionType should accept masked or non masked Instructions");
+    }
+
     FunctionType *VecFTy = OptVecFTyPos->first;
     // Check that the number of vectorized parameters matches VFInfo.
     // Both may include a mask.

From 29f11b1f0ae6eb5a10cce3c3b87a68d41accb584 Mon Sep 17 00:00:00 2001
From: Paschalis Mpeis <Paschalis.Mpeis at arm.com>
Date: Tue, 12 Dec 2023 17:33:18 +0000
Subject: [PATCH 03/10] Simplified createFunctionType method

It accepts ScalarFTy and VecRetTy. As the latter is not kept in VFABI,
it should instead come from the original Instruction/CallInst.

This change allows further simplification in the tests. Also, mangled
names like `_ZGVnN3lLRUlnLnRnUn_foo` from the tests, which contain only
scalar parameters, would still yield a valid FunctionType. This is an
edge case that will only be encountered in tests, though.
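
A sketch of the simplified call under this patch's signature (`Info`,
`ScalarFTy`, `CI`, and `M` are illustrative placeholders):

  // The vector return type comes from the already vectorized call,
  // since the VFABI mangled name does not encode it.
  Type *VecRetTy = CI->getType(); // a vector type, or void
  std::optional<std::pair<FunctionType *, int>> Res =
      VFABI::createFunctionType(Info, ScalarFTy, VecRetTy, M);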
---
 llvm/include/llvm/Analysis/VectorUtils.h      |  7 ++-
 llvm/lib/Analysis/VectorUtils.cpp             | 43 +++++----------
 .../Analysis/VectorFunctionABITest.cpp        | 53 +++++--------------
 3 files changed, 28 insertions(+), 75 deletions(-)

diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h
index 734c440283b4a0..6b3a001f7c6070 100644
--- a/llvm/include/llvm/Analysis/VectorUtils.h
+++ b/llvm/include/llvm/Analysis/VectorUtils.h
@@ -197,11 +197,10 @@ void getVectorVariantNames(const CallInst &CI,
                            SmallVectorImpl<std::string> &VariantMappings);
 
 /// Returns a pair of the vectorized FunctionType and the mask's position when
-/// there's one, otherwise -1. It rejects any non-vectorized calls as this
-/// method should be called at a point where the Instruction \p I is already
-/// vectorized.
+/// there's one, otherwise -1.
 std::optional<std::pair<FunctionType *, int>>
-createFunctionType(const VFInfo &Info, const Instruction *I, const Module *M);
+createFunctionType(const VFInfo &Info, const FunctionType *ScalarFTy,
+                   Type *VecRetTy, const Module *M);
 } // end namespace VFABI
 
 /// The Vector Function Database.
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index f4dc26da812250..19529e0f012a30 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -1479,32 +1479,17 @@ void VFABI::getVectorVariantNames(
   }
 }
 
-// Returns whether any of the operands or return type of \p I are vectors.
-static bool isVectorized(const Instruction *I) {
-  if (I->getType()->isVectorTy())
-    return true;
-  for (auto &U : I->operands())
-    if (U->getType()->isVectorTy())
-      return true;
-  return false;
-}
-
 std::optional<std::pair<FunctionType *, int>>
-VFABI::createFunctionType(const VFInfo &Info, const Instruction *I,
-                          const Module *M) {
-  // only vectorized calls should reach this method
-  if (!isVectorized(I))
-    return std::nullopt;
-
+VFABI::createFunctionType(const VFInfo &Info, const FunctionType *ScalarFTy,
+                          Type *VecRetTy, const Module *M) {
   ElementCount VF = Info.Shape.VF;
-  // get vectorized operands
-  const bool IsCall = isa<CallBase>(I);
-  SmallVector<Type *, 8> VecParams;
-  for (auto [i, U] : enumerate(I->operands())) {
-    // ignore the function pointer when the Instruction is a call
-    if (IsCall && i == I->getNumOperands() - 1)
-      break;
-    VecParams.push_back(U->getType());
+  // Create vector parameter types
+  SmallVector<Type *, 8> VecTypes;
+  for (auto [STy, VFParam] : zip(ScalarFTy->params(), Info.Shape.Parameters)) {
+    if (VFParam.ParamKind == VFParamKind::Vector)
+      VecTypes.push_back(VectorType::get(STy, VF));
+    else
+      VecTypes.push_back(STy);
   }
 
   // Get the mask's position and append a mask if not present in the Instruction.
@@ -1515,14 +1500,10 @@ VFABI::createFunctionType(const VFInfo &Info, const Instruction *I,
       return std::nullopt;
 
     MaskPos = OptMaskPos.value();
-    // append a mask only when it's missing
-    if (VecParams.size() == Info.Shape.Parameters.size() - 1) {
-      VectorType *MaskTy =
-          VectorType::get(Type::getInt1Ty(M->getContext()), VF);
-      VecParams.insert(VecParams.begin() + MaskPos, MaskTy);
-    }
+    VectorType *MaskTy = VectorType::get(Type::getInt1Ty(M->getContext()), VF);
+    VecTypes.insert(VecTypes.begin() + MaskPos, MaskTy);
   }
-  FunctionType *VecFTy = FunctionType::get(I->getType(), VecParams, false);
+  FunctionType *VecFTy = FunctionType::get(VecRetTy, VecTypes, false);
   return std::make_pair(VecFTy, MaskPos);
 }
 
diff --git a/llvm/unittests/Analysis/VectorFunctionABITest.cpp b/llvm/unittests/Analysis/VectorFunctionABITest.cpp
index 8daebb6042156d..e02a3b1c03c1f5 100644
--- a/llvm/unittests/Analysis/VectorFunctionABITest.cpp
+++ b/llvm/unittests/Analysis/VectorFunctionABITest.cpp
@@ -98,10 +98,9 @@ class VFABIParserTest : public ::testing::Test {
   /// number and type of arguments with both the ScalarFTy and the operands of
   /// the call.
   bool checkFunctionType() {
-    // Create a mock vectorized CallInst using dummy values and then use it to
-    // create a vector FunctionType. In the case of scalable ISAs, the created
-    // vector FunctionType might have a mask parameter Type, however, this input
-    // CallInst will not have a mask operand.
+    // For scalable ISAs, the created vector FunctionType might have a mask
+    // parameter Type, according to VFABI. Regardless, this input CallInst,
+    // despite being a vectorized call, will not have a mask operand.
     SmallVector<Value *, 8> Args;
     SmallVector<Type *, 8> CallTypes;
     for (auto [VFParam, STy] :
@@ -117,47 +116,21 @@ class VFABIParserTest : public ::testing::Test {
 
     // Mangled names do not currently encode return Type information. Generally,
     // return types are vectors, so use one.
-    Type *RetTy = ScalarFTy->getReturnType();
-    if (!RetTy->isVoidTy())
-      RetTy = VectorType::get(RetTy, Info.Shape.VF);
+    Type *VecRetTy = ScalarFTy->getReturnType();
+    if (!VecRetTy->isVoidTy())
+      VecRetTy = VectorType::get(VecRetTy, Info.Shape.VF);
 
     FunctionCallee F = M->getOrInsertFunction(
-        VectorName, FunctionType::get(RetTy, CallTypes, false));
+        VectorName, FunctionType::get(VecRetTy, CallTypes, false));
     std::unique_ptr<CallInst> CI(CallInst::Create(F, Args));
 
     // Use VFInfo and the mock CallInst to create a FunctionType that will
-    // include a mask where relevant.
-    auto OptVecFTyPos = VFABI::createFunctionType(Info, CI.get(), M.get());
+    // include a mask when relevant.
+    auto OptVecFTyPos =
+        VFABI::createFunctionType(Info, ScalarFTy, VecRetTy, M.get());
     if (!OptVecFTyPos)
       return false;
 
-    // Ensure that masked Instructions are handled
-    if (isMasked()) {
-      // In case of a masked call, try creating another mock CallInst that is
-      // masked. createFunctionType should be able to handle this.
-      SmallVector<Type *, 8> CallTypesInclMask(CallTypes);
-      SmallVector<Value *, 8> ArgsInclMask(Args);
-      Type *MaskTy = VectorType::get(Type::getInt1Ty(M->getContext()), VF);
-      CallTypesInclMask.push_back(MaskTy);
-      ArgsInclMask.push_back(Constant::getNullValue(MaskTy));
-
-      FunctionCallee FMasked = M->getOrInsertFunction(
-          VectorName + "_Masked",
-          FunctionType::get(RetTy, CallTypesInclMask, false));
-      std::unique_ptr<CallInst> CIMasked(
-          CallInst::Create(FMasked, ArgsInclMask));
-      auto OptVecFTyMaskedPos =
-          VFABI::createFunctionType(Info, CIMasked.get(), M.get());
-      if (!OptVecFTyMaskedPos)
-        return false;
-
-      // Both FunctionTypes should have the same number of parameters.
-      assert(
-          (OptVecFTyPos->first->getNumParams() ==
-           OptVecFTyMaskedPos->first->getNumParams()) &&
-          "createFunctionType should accept masked or non masked Instructions");
-    }
-
     FunctionType *VecFTy = OptVecFTyPos->first;
     // Check that the number of vectorized parameters matches VFInfo.
     // Both may include a mask.
@@ -324,7 +297,7 @@ TEST_F(VFABIParserTest, LinearWithCompileTimeNegativeStep) {
   EXPECT_EQ(ISA, VFISAKind::AdvancedSIMD);
   EXPECT_FALSE(isMasked());
   EXPECT_TRUE(matchParametersNum());
-  EXPECT_FALSE(checkFunctionType()); // invalid: all operands are scalar
+  EXPECT_TRUE(checkFunctionType());
   EXPECT_EQ(VF, ElementCount::getFixed(2));
   EXPECT_EQ(Parameters.size(), (unsigned)4);
   EXPECT_EQ(Parameters[0], VFParameter({0, VFParamKind::OMP_Linear, -1}));
@@ -389,7 +362,7 @@ TEST_F(VFABIParserTest, LinearWithoutCompileTime) {
   EXPECT_EQ(ISA, VFISAKind::AdvancedSIMD);
   EXPECT_FALSE(isMasked());
   EXPECT_TRUE(matchParametersNum());
-  EXPECT_FALSE(checkFunctionType()); // invalid: all operands are scalar
+  EXPECT_TRUE(checkFunctionType());
   EXPECT_EQ(Parameters.size(), (unsigned)8);
   EXPECT_EQ(Parameters[0], VFParameter({0, VFParamKind::OMP_Linear, 1}));
   EXPECT_EQ(Parameters[1], VFParameter({1, VFParamKind::OMP_LinearVal, 1}));
@@ -452,7 +425,7 @@ TEST_F(VFABIParserTest, ParseUniform) {
   EXPECT_EQ(ISA, VFISAKind::AdvancedSIMD);
   EXPECT_FALSE(isMasked());
   EXPECT_TRUE(matchParametersNum());
-  EXPECT_FALSE(checkFunctionType()); // invalid: all operands are scalar
+  EXPECT_TRUE(checkFunctionType());
   EXPECT_EQ(VF, ElementCount::getFixed(2));
   EXPECT_EQ(Parameters.size(), (unsigned)1);
   EXPECT_EQ(Parameters[0], VFParameter({0, VFParamKind::OMP_Uniform, 0}));

From 97ea3b75d5f53ba0096c9ea560c86cbe2af12c70 Mon Sep 17 00:00:00 2001
From: Paschalis Mpeis <Paschalis.Mpeis at arm.com>
Date: Wed, 13 Dec 2023 14:15:18 +0000
Subject: [PATCH 04/10] createFunctionType requires only VFInfo and ScalarFTy

It no longer returns a pair, as the mask position can be queried
directly from VFInfo, which createFunctionType needs as an argument to
begin with.

The return type is also taken from ScalarFTy, as the specification does
not encode such information in the mangled name; therefore, VFInfo does
not hold it. If that ever changes, the return type will make its way
into VFInfo and can be obtained from there.
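
A sketch of the resulting usage (`Info` and `ScalarFTy` are
illustrative; the mask position, when needed, is queried separately):

  if (std::optional<FunctionType *> OptVecFTy =
          VFABI::createFunctionType(Info, ScalarFTy)) {
    FunctionType *VecFTy = *OptVecFTy;
    // Mask position, if any, now comes straight from VFInfo.
    std::optional<unsigned> MaskPos = Info.getParamIndexForOptionalMask();
  }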
---
 llvm/include/llvm/Analysis/VectorUtils.h         | 11 ++++++-----
 llvm/lib/Analysis/VectorUtils.cpp                | 16 ++++++++++------
 .../unittests/Analysis/VectorFunctionABITest.cpp |  7 +++----
 3 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h
index 6b3a001f7c6070..fdb74a2431d138 100644
--- a/llvm/include/llvm/Analysis/VectorUtils.h
+++ b/llvm/include/llvm/Analysis/VectorUtils.h
@@ -196,11 +196,12 @@ static constexpr char const *MappingsAttrName = "vector-function-abi-variant";
 void getVectorVariantNames(const CallInst &CI,
                            SmallVectorImpl<std::string> &VariantMappings);
 
-/// Returns a pair of the vectorized FunctionType and the mask's position when
-/// there's one, otherwise -1.
-std::optional<std::pair<FunctionType *, int>>
-createFunctionType(const VFInfo &Info, const FunctionType *ScalarFTy,
-                   Type *VecRetTy, const Module *M);
+/// Returns a vectorized FunctionType that was previously found in
+/// TargetLibraryInfo. It uses \p ScalarFTy for the types, and \p Info to get
+/// the vectorization factor and whether a particular parameter is indeed a
+/// vector, since some of them may be scalars.
+std::optional<FunctionType *> createFunctionType(const VFInfo &Info,
+                                                 const FunctionType *ScalarFTy);
 } // end namespace VFABI
 
 /// The Vector Function Database.
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 19529e0f012a30..9a8de7974ba921 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -21,6 +21,7 @@
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/Value.h"
@@ -1479,9 +1480,8 @@ void VFABI::getVectorVariantNames(
   }
 }
 
-std::optional<std::pair<FunctionType *, int>>
-VFABI::createFunctionType(const VFInfo &Info, const FunctionType *ScalarFTy,
-                          Type *VecRetTy, const Module *M) {
+std::optional<FunctionType *>
+VFABI::createFunctionType(const VFInfo &Info, const FunctionType *ScalarFTy) {
   ElementCount VF = Info.Shape.VF;
   // Create vector parameter types
   SmallVector<Type *, 8> VecTypes;
@@ -1500,11 +1500,15 @@ VFABI::createFunctionType(const VFInfo &Info, const FunctionType *ScalarFTy,
       return std::nullopt;
 
     MaskPos = OptMaskPos.value();
-    VectorType *MaskTy = VectorType::get(Type::getInt1Ty(M->getContext()), VF);
+    VectorType *MaskTy =
+        VectorType::get(Type::getInt1Ty(ScalarFTy->getContext()), VF);
     VecTypes.insert(VecTypes.begin() + MaskPos, MaskTy);
   }
-  FunctionType *VecFTy = FunctionType::get(VecRetTy, VecTypes, false);
-  return std::make_pair(VecFTy, MaskPos);
+  auto *RetTy = ScalarFTy->getReturnType();
+  if (!RetTy->isVoidTy())
+    RetTy = VectorType::get(ScalarFTy->getReturnType(), VF);
+  FunctionType *VecFTy = FunctionType::get(RetTy, VecTypes, false);
+  return VecFTy;
 }
 
 bool VFShape::hasValidParameterList() const {
diff --git a/llvm/unittests/Analysis/VectorFunctionABITest.cpp b/llvm/unittests/Analysis/VectorFunctionABITest.cpp
index e02a3b1c03c1f5..e94a79c1714e67 100644
--- a/llvm/unittests/Analysis/VectorFunctionABITest.cpp
+++ b/llvm/unittests/Analysis/VectorFunctionABITest.cpp
@@ -126,12 +126,11 @@ class VFABIParserTest : public ::testing::Test {
 
     // Use VFInfo and the mock CallInst to create a FunctionType that will
     // include a mask when relevant.
-    auto OptVecFTyPos =
-        VFABI::createFunctionType(Info, ScalarFTy, VecRetTy, M.get());
-    if (!OptVecFTyPos)
+    auto OptVecFTy = VFABI::createFunctionType(Info, ScalarFTy);
+    if (!OptVecFTy)
       return false;
 
-    FunctionType *VecFTy = OptVecFTyPos->first;
+    FunctionType *VecFTy = *OptVecFTy;
     // Check that the number of vectorized parameters matches VFInfo.
     // Both may include a mask.
     if ((VecFTy->getNumParams() != Info.Shape.Parameters.size()))

From 3f1e423f3a06410d7a9f787f5d1a3f89453f3b57 Mon Sep 17 00:00:00 2001
From: Paschalis Mpeis <Paschalis.Mpeis at arm.com>
Date: Thu, 14 Dec 2023 08:47:06 +0000
Subject: [PATCH 05/10] Cleanup: addressing review.

---
 llvm/include/llvm/Analysis/VectorUtils.h        |  4 ++--
 llvm/lib/Analysis/VectorUtils.cpp               | 17 ++++++-----------
 .../Analysis/VectorFunctionABITest.cpp          |  5 ++---
 3 files changed, 10 insertions(+), 16 deletions(-)

diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h
index fdb74a2431d138..ccf4a292daeb82 100644
--- a/llvm/include/llvm/Analysis/VectorUtils.h
+++ b/llvm/include/llvm/Analysis/VectorUtils.h
@@ -200,8 +200,8 @@ void getVectorVariantNames(const CallInst &CI,
 /// TargetLibraryInfo. It uses \p ScalarFTy for the types, and \p Info to get
 /// the vectorization factor and whether a particular parameter is indeed a
 /// vector, since some of them may be scalars.
-std::optional<FunctionType *> createFunctionType(const VFInfo &Info,
-                                                 const FunctionType *ScalarFTy);
+FunctionType *createFunctionType(const VFInfo &Info,
+                                 const FunctionType *ScalarFTy);
 } // end namespace VFABI
 
 /// The Vector Function Database.
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 9a8de7974ba921..c64ca1d845626b 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -1480,8 +1480,8 @@ void VFABI::getVectorVariantNames(
   }
 }
 
-std::optional<FunctionType *>
-VFABI::createFunctionType(const VFInfo &Info, const FunctionType *ScalarFTy) {
+FunctionType *VFABI::createFunctionType(const VFInfo &Info,
+                                        const FunctionType *ScalarFTy) {
   ElementCount VF = Info.Shape.VF;
   // Create vector parameter types
   SmallVector<Type *, 8> VecTypes;
@@ -1493,22 +1493,17 @@ VFABI::createFunctionType(const VFInfo &Info, const FunctionType *ScalarFTy) {
   }
 
   // Get the mask's position and append a mask if not present in the Instruction.
-  int MaskPos = -1;
-  if (Info.isMasked()) {
-    auto OptMaskPos = Info.getParamIndexForOptionalMask();
+  if (auto OptMaskPos = Info.getParamIndexForOptionalMask()) {
     if (!OptMaskPos)
-      return std::nullopt;
-
-    MaskPos = OptMaskPos.value();
+      return nullptr;
     VectorType *MaskTy =
         VectorType::get(Type::getInt1Ty(ScalarFTy->getContext()), VF);
-    VecTypes.insert(VecTypes.begin() + MaskPos, MaskTy);
+    VecTypes.insert(VecTypes.begin() + OptMaskPos.value(), MaskTy);
   }
   auto *RetTy = ScalarFTy->getReturnType();
   if (!RetTy->isVoidTy())
     RetTy = VectorType::get(ScalarFTy->getReturnType(), VF);
-  FunctionType *VecFTy = FunctionType::get(RetTy, VecTypes, false);
-  return VecFTy;
+  return FunctionType::get(RetTy, VecTypes, false);
 }
 
 bool VFShape::hasValidParameterList() const {
diff --git a/llvm/unittests/Analysis/VectorFunctionABITest.cpp b/llvm/unittests/Analysis/VectorFunctionABITest.cpp
index e94a79c1714e67..5307493b6c5b3e 100644
--- a/llvm/unittests/Analysis/VectorFunctionABITest.cpp
+++ b/llvm/unittests/Analysis/VectorFunctionABITest.cpp
@@ -126,11 +126,10 @@ class VFABIParserTest : public ::testing::Test {
 
     // Use VFInfo and the mock CallInst to create a FunctionType that will
     // include a mask when relevant.
-    auto OptVecFTy = VFABI::createFunctionType(Info, ScalarFTy);
-    if (!OptVecFTy)
+    FunctionType *VecFTy = VFABI::createFunctionType(Info, ScalarFTy);
+    if (!VecFTy)
       return false;
 
-    FunctionType *VecFTy = *OptVecFTy;
     // Check that the number of vectorized parameters matches VFInfo.
     // Both may include a mask.
     if ((VecFTy->getNumParams() != Info.Shape.Parameters.size()))

From 3f9626b8b2ea7841e517bb855479e66839ed0d06 Mon Sep 17 00:00:00 2001
From: Paschalis Mpeis <Paschalis.Mpeis at arm.com>
Date: Fri, 24 Nov 2023 13:44:45 +0000
Subject: [PATCH 06/10] [NFC][TLI] Improve tests for ArmPL and SLEEF
 Intrinsics.

Auto-generate test `armpl-intrinsics.ll`, and use active lane mask to
have shorter `shufflevector` check lines.

The update scripts now add `@llvm.compiler.used` directly instead of
using the regex:
`@[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]]`
---
 .../AArch64/sleef-intrinsic-calls-aarch64.ll  | 190 +++++++++++-------
 1 file changed, 114 insertions(+), 76 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sleef-intrinsic-calls-aarch64.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sleef-intrinsic-calls-aarch64.ll
index 2300ce74996e39..83898374c1c6c5 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sleef-intrinsic-calls-aarch64.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sleef-intrinsic-calls-aarch64.ll
@@ -139,8 +139,9 @@ define void @llvm_cos_f64(double* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_cos_f64
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP18:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_cos(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_cos(<vscale x 2 x double> [[TMP11:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; SVE:    [[CALL:%.*]] = tail call double @llvm.cos.f64(double [[CONV:%.*]]) #[[ATTR4:[0-9]+]]
 ; SVE:    ret void
 ;
   entry:
@@ -168,8 +169,9 @@ define void @llvm_cos_f32(float* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_cos_f32
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP18:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_cosf(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_cosf(<vscale x 4 x float> [[TMP11:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; SVE:    [[CALL:%.*]] = tail call float @llvm.cos.f32(float [[CONV:%.*]]) #[[ATTR5:[0-9]+]]
 ; SVE:    ret void
 ;
   entry:
@@ -200,8 +202,9 @@ define void @llvm_exp_f64(double* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_exp_f64
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP18:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp(<vscale x 2 x double> [[TMP11:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; SVE:    [[CALL:%.*]] = tail call double @llvm.exp.f64(double [[CONV:%.*]]) #[[ATTR6:[0-9]+]]
 ; SVE:    ret void
 ;
   entry:
@@ -229,8 +232,9 @@ define void @llvm_exp_f32(float* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_exp_f32
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP18:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_expf(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_expf(<vscale x 4 x float> [[TMP11:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; SVE:    [[CALL:%.*]] = tail call float @llvm.exp.f32(float [[CONV:%.*]]) #[[ATTR7:[0-9]+]]
 ; SVE:    ret void
 ;
   entry:
@@ -261,8 +265,9 @@ define void @llvm_exp2_f64(double* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_exp2_f64
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP18:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp2(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp2(<vscale x 2 x double> [[TMP11:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; SVE:    [[CALL:%.*]] = tail call double @llvm.exp2.f64(double [[CONV:%.*]]) #[[ATTR8:[0-9]+]]
 ; SVE:    ret void
 ;
   entry:
@@ -290,8 +295,9 @@ define void @llvm_exp2_f32(float* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_exp2_f32
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP18:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_exp2f(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_exp2f(<vscale x 4 x float> [[TMP11:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; SVE:    [[CALL:%.*]] = tail call float @llvm.exp2.f32(float [[CONV:%.*]]) #[[ATTR9:[0-9]+]]
 ; SVE:    ret void
 ;
   entry:
@@ -322,8 +328,9 @@ define void @llvm_exp10_f64(double* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_exp10_f64
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP18:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp10(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp10(<vscale x 2 x double> [[TMP11:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; SVE:    [[CALL:%.*]] = tail call double @llvm.exp10.f64(double [[CONV:%.*]]) #[[ATTR10:[0-9]+]]
 ; SVE:    ret void
 ;
   entry:
@@ -351,8 +358,9 @@ define void @llvm_exp10_f32(float* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_exp10_f32
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP18:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_exp10f(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_exp10f(<vscale x 4 x float> [[TMP11:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; SVE:    [[CALL:%.*]] = tail call float @llvm.exp10.f32(float [[CONV:%.*]]) #[[ATTR11:[0-9]+]]
 ; SVE:    ret void
 ;
   entry:
@@ -383,8 +391,9 @@ define void @llvm_fabs_f64(double* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_fabs_f64
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP18:%.*]] = call <vscale x 2 x double> @llvm.fabs.nxv2f64(<vscale x 2 x double> [[TMP17:%.*]])
+; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @llvm.fabs.nxv2f64(<vscale x 2 x double> [[TMP11:%.*]])
+; SVE:    [[CALL:%.*]] = tail call double @llvm.fabs.f64(double [[CONV:%.*]])
 ; SVE:    ret void
 ;
   entry:
@@ -413,8 +422,9 @@ define void @llvm_fabs_f32(float* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_fabs_f32
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP18:%.*]] = call <vscale x 4 x float> @llvm.fabs.nxv4f32(<vscale x 4 x float> [[TMP17:%.*]])
+; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @llvm.fabs.nxv4f32(<vscale x 4 x float> [[TMP11:%.*]])
+; SVE:    [[CALL:%.*]] = tail call float @llvm.fabs.f32(float [[CONV:%.*]])
 ; SVE:    ret void
 ;
   entry:
@@ -445,8 +455,9 @@ define void @llvm_floor_f64(double* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_floor_f64
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP18:%.*]] = call <vscale x 2 x double> @llvm.floor.nxv2f64(<vscale x 2 x double> [[TMP17:%.*]])
+; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @llvm.floor.nxv2f64(<vscale x 2 x double> [[TMP11:%.*]])
+; SVE:    [[CALL:%.*]] = tail call double @llvm.floor.f64(double [[CONV:%.*]])
 ; SVE:    ret void
 ;
   entry:
@@ -474,8 +485,9 @@ define void @llvm_floor_f32(float* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_floor_f32
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP18:%.*]] = call <vscale x 4 x float> @llvm.floor.nxv4f32(<vscale x 4 x float> [[TMP17:%.*]])
+; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @llvm.floor.nxv4f32(<vscale x 4 x float> [[TMP11:%.*]])
+; SVE:    [[CALL:%.*]] = tail call float @llvm.floor.f32(float [[CONV:%.*]])
 ; SVE:    ret void
 ;
   entry:
@@ -506,8 +518,9 @@ define void @llvm_fma_f64(double* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_fma_f64
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP18:%.*]] = call <vscale x 2 x double> @llvm.fma.nxv2f64(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x double> [[TMP17]], <vscale x 2 x double> [[TMP17]])
+; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @llvm.fma.nxv2f64(<vscale x 2 x double> [[TMP11:%.*]], <vscale x 2 x double> [[TMP11]], <vscale x 2 x double> [[TMP11]])
+; SVE:    [[CALL:%.*]] = tail call double @llvm.fma.f64(double [[CONV:%.*]], double [[CONV]], double [[CONV]])
 ; SVE:    ret void
 ;
   entry:
@@ -535,8 +548,9 @@ define void @llvm_fma_f32(float* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_fma_f32
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP18:%.*]] = call <vscale x 4 x float> @llvm.fma.nxv4f32(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x float> [[TMP17]], <vscale x 4 x float> [[TMP17]])
+; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @llvm.fma.nxv4f32(<vscale x 4 x float> [[TMP11:%.*]], <vscale x 4 x float> [[TMP11]], <vscale x 4 x float> [[TMP11]])
+; SVE:    [[CALL:%.*]] = tail call float @llvm.fma.f32(float [[CONV:%.*]], float [[CONV]], float [[CONV]])
 ; SVE:    ret void
 ;
   entry:
@@ -567,8 +581,9 @@ define void @llvm_log_f64(double* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_log_f64
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP18:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log(<vscale x 2 x double> [[TMP11:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; SVE:    [[CALL:%.*]] = tail call double @llvm.log.f64(double [[CONV:%.*]]) #[[ATTR12:[0-9]+]]
 ; SVE:    ret void
 ;
   entry:
@@ -596,8 +611,9 @@ define void @llvm_log_f32(float* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_log_f32
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP18:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_logf(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_logf(<vscale x 4 x float> [[TMP11:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; SVE:    [[CALL:%.*]] = tail call float @llvm.log.f32(float [[CONV:%.*]]) #[[ATTR13:[0-9]+]]
 ; SVE:    ret void
 ;
   entry:
@@ -628,8 +644,9 @@ define void @llvm_log10_f64(double* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_log10_f64
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP18:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log10(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log10(<vscale x 2 x double> [[TMP11:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; SVE:    [[CALL:%.*]] = tail call double @llvm.log10.f64(double [[CONV:%.*]]) #[[ATTR14:[0-9]+]]
 ; SVE:    ret void
 ;
   entry:
@@ -657,8 +674,9 @@ define void @llvm_log10_f32(float* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_log10_f32
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP18:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_log10f(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_log10f(<vscale x 4 x float> [[TMP11:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; SVE:    [[CALL:%.*]] = tail call float @llvm.log10.f32(float [[CONV:%.*]]) #[[ATTR15:[0-9]+]]
 ; SVE:    ret void
 ;
   entry:
@@ -689,8 +707,9 @@ define void @llvm_log2_f64(double* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_log2_f64
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP18:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log2(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log2(<vscale x 2 x double> [[TMP11:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; SVE:    [[CALL:%.*]] = tail call double @llvm.log2.f64(double [[CONV:%.*]]) #[[ATTR16:[0-9]+]]
 ; SVE:    ret void
 ;
   entry:
@@ -718,8 +737,9 @@ define void @llvm_log2_f32(float* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_log2_f32
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP18:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_log2f(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_log2f(<vscale x 4 x float> [[TMP11:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; SVE:    [[CALL:%.*]] = tail call float @llvm.log2.f32(float [[CONV:%.*]]) #[[ATTR17:[0-9]+]]
 ; SVE:    ret void
 ;
   entry:
@@ -750,8 +770,9 @@ define void @llvm_maxnum_f64(double* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_maxnum_f64
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP18:%.*]] = call <vscale x 2 x double> @llvm.maxnum.nxv2f64(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x double> [[TMP17]])
+; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @llvm.maxnum.nxv2f64(<vscale x 2 x double> [[TMP11:%.*]], <vscale x 2 x double> [[TMP11]])
+; SVE:    [[CALL:%.*]] = tail call double @llvm.maxnum.f64(double [[CONV:%.*]], double [[CONV]])
 ; SVE:    ret void
 ;
   entry:
@@ -779,8 +800,9 @@ define void @llvm_maxnum_f32(float* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_maxnum_f32
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP18:%.*]] = call <vscale x 4 x float> @llvm.maxnum.nxv4f32(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x float> [[TMP17]])
+; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @llvm.maxnum.nxv4f32(<vscale x 4 x float> [[TMP11:%.*]], <vscale x 4 x float> [[TMP11]])
+; SVE:    [[CALL:%.*]] = tail call float @llvm.maxnum.f32(float [[CONV:%.*]], float [[CONV]])
 ; SVE:    ret void
 ;
   entry:
@@ -811,8 +833,9 @@ define void @llvm_minnum_f64(double* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_minnum_f64
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP18:%.*]] = call <vscale x 2 x double> @llvm.minnum.nxv2f64(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x double> [[TMP17]])
+; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @llvm.minnum.nxv2f64(<vscale x 2 x double> [[TMP11:%.*]], <vscale x 2 x double> [[TMP11]])
+; SVE:    [[CALL:%.*]] = tail call double @llvm.minnum.f64(double [[CONV:%.*]], double [[CONV]])
 ; SVE:    ret void
 ;
   entry:
@@ -840,8 +863,9 @@ define void @llvm_minnum_f32(float* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_minnum_f32
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP18:%.*]] = call <vscale x 4 x float> @llvm.minnum.nxv4f32(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x float> [[TMP17]])
+; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @llvm.minnum.nxv4f32(<vscale x 4 x float> [[TMP11:%.*]], <vscale x 4 x float> [[TMP11]])
+; SVE:    [[CALL:%.*]] = tail call float @llvm.minnum.f32(float [[CONV:%.*]], float [[CONV]])
 ; SVE:    ret void
 ;
   entry:
@@ -872,8 +896,9 @@ define void @llvm_nearbyint_f64(double* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_nearbyint_f64
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP18:%.*]] = call <vscale x 2 x double> @llvm.nearbyint.nxv2f64(<vscale x 2 x double> [[TMP17:%.*]])
+; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @llvm.nearbyint.nxv2f64(<vscale x 2 x double> [[TMP11:%.*]])
+; SVE:    [[CALL:%.*]] = tail call double @llvm.nearbyint.f64(double [[CONV:%.*]])
 ; SVE:    ret void
 ;
   entry:
@@ -901,8 +926,9 @@ define void @llvm_nearbyint_f32(float* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_nearbyint_f32
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP18:%.*]] = call <vscale x 4 x float> @llvm.nearbyint.nxv4f32(<vscale x 4 x float> [[TMP17:%.*]])
+; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @llvm.nearbyint.nxv4f32(<vscale x 4 x float> [[TMP11:%.*]])
+; SVE:    [[CALL:%.*]] = tail call float @llvm.nearbyint.f32(float [[CONV:%.*]])
 ; SVE:    ret void
 ;
   entry:
@@ -933,8 +959,9 @@ define void @llvm_pow_f64(double* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_pow_f64
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP18:%.*]] = call <vscale x 2 x double> @_ZGVsMxvv_pow(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x double> [[TMP17]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxvv_pow(<vscale x 2 x double> [[TMP11:%.*]], <vscale x 2 x double> [[TMP11]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; SVE:    [[CALL:%.*]] = tail call double @llvm.pow.f64(double [[CONV:%.*]], double [[CONV]]) #[[ATTR18:[0-9]+]]
 ; SVE:    ret void
 ;
   entry:
@@ -962,8 +989,9 @@ define void @llvm_pow_f32(float* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_pow_f32
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP18:%.*]] = call <vscale x 4 x float> @_ZGVsMxvv_powf(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x float> [[TMP17]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxvv_powf(<vscale x 4 x float> [[TMP11:%.*]], <vscale x 4 x float> [[TMP11]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; SVE:    [[CALL:%.*]] = tail call float @llvm.pow.f32(float [[CONV:%.*]], float [[CONV]]) #[[ATTR19:[0-9]+]]
 ; SVE:    ret void
 ;
   entry:
@@ -994,8 +1022,9 @@ define void @llvm_rint_f64(double* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_rint_f64
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP18:%.*]] = call <vscale x 2 x double> @llvm.rint.nxv2f64(<vscale x 2 x double> [[TMP17:%.*]])
+; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @llvm.rint.nxv2f64(<vscale x 2 x double> [[TMP11:%.*]])
+; SVE:    [[CALL:%.*]] = tail call double @llvm.rint.f64(double [[CONV:%.*]])
 ; SVE:    ret void
 ;
   entry:
@@ -1023,8 +1052,9 @@ define void @llvm_rint_f32(float* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_rint_f32
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP18:%.*]] = call <vscale x 4 x float> @llvm.rint.nxv4f32(<vscale x 4 x float> [[TMP17:%.*]])
+; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @llvm.rint.nxv4f32(<vscale x 4 x float> [[TMP11:%.*]])
+; SVE:    [[CALL:%.*]] = tail call float @llvm.rint.f32(float [[CONV:%.*]])
 ; SVE:    ret void
 ;
   entry:
@@ -1055,8 +1085,9 @@ define void @llvm_round_f64(double* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_round_f64
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP18:%.*]] = call <vscale x 2 x double> @llvm.round.nxv2f64(<vscale x 2 x double> [[TMP17:%.*]])
+; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @llvm.round.nxv2f64(<vscale x 2 x double> [[TMP11:%.*]])
+; SVE:    [[CALL:%.*]] = tail call double @llvm.round.f64(double [[CONV:%.*]])
 ; SVE:    ret void
 ;
   entry:
@@ -1084,8 +1115,9 @@ define void @llvm_round_f32(float* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_round_f32
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP18:%.*]] = call <vscale x 4 x float> @llvm.round.nxv4f32(<vscale x 4 x float> [[TMP17:%.*]])
+; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @llvm.round.nxv4f32(<vscale x 4 x float> [[TMP11:%.*]])
+; SVE:    [[CALL:%.*]] = tail call float @llvm.round.f32(float [[CONV:%.*]])
 ; SVE:    ret void
 ;
   entry:
@@ -1116,8 +1148,9 @@ define void @llvm_sin_f64(double* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_sin_f64
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP18:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_sin(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_sin(<vscale x 2 x double> [[TMP11:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; SVE:    [[CALL:%.*]] = tail call double @llvm.sin.f64(double [[CONV:%.*]]) #[[ATTR20:[0-9]+]]
 ; SVE:    ret void
 ;
   entry:
@@ -1145,8 +1178,9 @@ define void @llvm_sin_f32(float* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_sin_f32
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP18:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_sinf(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_sinf(<vscale x 4 x float> [[TMP11:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; SVE:    [[CALL:%.*]] = tail call float @llvm.sin.f32(float [[CONV:%.*]]) #[[ATTR21:[0-9]+]]
 ; SVE:    ret void
 ;
   entry:
@@ -1177,8 +1211,9 @@ define void @llvm_sqrt_f64(double* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_sqrt_f64
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP18:%.*]] = call <vscale x 2 x double> @llvm.sqrt.nxv2f64(<vscale x 2 x double> [[TMP17:%.*]])
+; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @llvm.sqrt.nxv2f64(<vscale x 2 x double> [[TMP11:%.*]])
+; SVE:    [[CALL:%.*]] = tail call double @llvm.sqrt.f64(double [[CONV:%.*]])
 ; SVE:    ret void
 ;
   entry:
@@ -1206,8 +1241,9 @@ define void @llvm_sqrt_f32(float* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_sqrt_f32
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP18:%.*]] = call <vscale x 4 x float> @llvm.sqrt.nxv4f32(<vscale x 4 x float> [[TMP17:%.*]])
+; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @llvm.sqrt.nxv4f32(<vscale x 4 x float> [[TMP11:%.*]])
+; SVE:    [[CALL:%.*]] = tail call float @llvm.sqrt.f32(float [[CONV:%.*]])
 ; SVE:    ret void
 ;
   entry:
@@ -1238,8 +1274,9 @@ define void @llvm_trunc_f64(double* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_trunc_f64
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP18:%.*]] = call <vscale x 2 x double> @llvm.trunc.nxv2f64(<vscale x 2 x double> [[TMP17:%.*]])
+; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @llvm.trunc.nxv2f64(<vscale x 2 x double> [[TMP11:%.*]])
+; SVE:    [[CALL:%.*]] = tail call double @llvm.trunc.f64(double [[CONV:%.*]])
 ; SVE:    ret void
 ;
   entry:
@@ -1267,8 +1304,9 @@ define void @llvm_trunc_f32(float* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_trunc_f32
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP18:%.*]] = call <vscale x 4 x float> @llvm.trunc.nxv4f32(<vscale x 4 x float> [[TMP17:%.*]])
+; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @llvm.trunc.nxv4f32(<vscale x 4 x float> [[TMP11:%.*]])
+; SVE:    [[CALL:%.*]] = tail call float @llvm.trunc.f32(float [[CONV:%.*]])
 ; SVE:    ret void
 ;
   entry:

>From 50e0cb7ea4cc7b99c8f15565a01598e00f07962a Mon Sep 17 00:00:00 2001
From: Paschalis Mpeis <Paschalis.Mpeis at arm.com>
Date: Mon, 27 Nov 2023 17:36:29 +0000
Subject: [PATCH 07/10] Add `simplifycfg` pass and `noalias` to ensure tail
 folding.

The `noalias` attribute was added only to the `%in.ptr` parameter of the
ArmPL intrinsics.
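
For intuition (not part of the patch itself): a `noalias` IR parameter plays
roughly the role of a `restrict`/`__restrict`-qualified pointer in C/C++, and
per this commit message it is what lets the vectorizer tail-fold these loops.
A minimal sketch of the kind of scalar loop the tests model, with illustrative
names only:

```cpp
#include <cmath>

// Illustrative only: roughly the scalar pattern behind these tests.
// Marking the pointer __restrict corresponds to the IR-level `noalias`
// attribute, so the loop vectorizer may assume the stores do not alias
// other memory and can fold the loop tail under a predicate mask.
void store_sin(double *__restrict varray, int n) {
  for (int i = 0; i < n; ++i)
    varray[i] = std::sin(static_cast<double>(i + 1));
}
```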
---
 .../AArch64/sleef-intrinsic-calls-aarch64.ll  | 190 +++++++-----------
 1 file changed, 76 insertions(+), 114 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sleef-intrinsic-calls-aarch64.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sleef-intrinsic-calls-aarch64.ll
index 83898374c1c6c5..2300ce74996e39 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sleef-intrinsic-calls-aarch64.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sleef-intrinsic-calls-aarch64.ll
@@ -139,9 +139,8 @@ define void @llvm_cos_f64(double* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_cos_f64
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_cos(<vscale x 2 x double> [[TMP11:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
-; SVE:    [[CALL:%.*]] = tail call double @llvm.cos.f64(double [[CONV:%.*]]) #[[ATTR4:[0-9]+]]
+; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP18:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_cos(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
 ; SVE:    ret void
 ;
   entry:
@@ -169,9 +168,8 @@ define void @llvm_cos_f32(float* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_cos_f32
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_cosf(<vscale x 4 x float> [[TMP11:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
-; SVE:    [[CALL:%.*]] = tail call float @llvm.cos.f32(float [[CONV:%.*]]) #[[ATTR5:[0-9]+]]
+; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP18:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_cosf(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
 ; SVE:    ret void
 ;
   entry:
@@ -202,9 +200,8 @@ define void @llvm_exp_f64(double* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_exp_f64
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp(<vscale x 2 x double> [[TMP11:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
-; SVE:    [[CALL:%.*]] = tail call double @llvm.exp.f64(double [[CONV:%.*]]) #[[ATTR6:[0-9]+]]
+; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP18:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
 ; SVE:    ret void
 ;
   entry:
@@ -232,9 +229,8 @@ define void @llvm_exp_f32(float* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_exp_f32
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_expf(<vscale x 4 x float> [[TMP11:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
-; SVE:    [[CALL:%.*]] = tail call float @llvm.exp.f32(float [[CONV:%.*]]) #[[ATTR7:[0-9]+]]
+; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP18:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_expf(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
 ; SVE:    ret void
 ;
   entry:
@@ -265,9 +261,8 @@ define void @llvm_exp2_f64(double* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_exp2_f64
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp2(<vscale x 2 x double> [[TMP11:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
-; SVE:    [[CALL:%.*]] = tail call double @llvm.exp2.f64(double [[CONV:%.*]]) #[[ATTR8:[0-9]+]]
+; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP18:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp2(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
 ; SVE:    ret void
 ;
   entry:
@@ -295,9 +290,8 @@ define void @llvm_exp2_f32(float* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_exp2_f32
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_exp2f(<vscale x 4 x float> [[TMP11:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
-; SVE:    [[CALL:%.*]] = tail call float @llvm.exp2.f32(float [[CONV:%.*]]) #[[ATTR9:[0-9]+]]
+; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP18:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_exp2f(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
 ; SVE:    ret void
 ;
   entry:
@@ -328,9 +322,8 @@ define void @llvm_exp10_f64(double* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_exp10_f64
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp10(<vscale x 2 x double> [[TMP11:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
-; SVE:    [[CALL:%.*]] = tail call double @llvm.exp10.f64(double [[CONV:%.*]]) #[[ATTR10:[0-9]+]]
+; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP18:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp10(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
 ; SVE:    ret void
 ;
   entry:
@@ -358,9 +351,8 @@ define void @llvm_exp10_f32(float* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_exp10_f32
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_exp10f(<vscale x 4 x float> [[TMP11:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
-; SVE:    [[CALL:%.*]] = tail call float @llvm.exp10.f32(float [[CONV:%.*]]) #[[ATTR11:[0-9]+]]
+; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP18:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_exp10f(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
 ; SVE:    ret void
 ;
   entry:
@@ -391,9 +383,8 @@ define void @llvm_fabs_f64(double* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_fabs_f64
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @llvm.fabs.nxv2f64(<vscale x 2 x double> [[TMP11:%.*]])
-; SVE:    [[CALL:%.*]] = tail call double @llvm.fabs.f64(double [[CONV:%.*]])
+; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP18:%.*]] = call <vscale x 2 x double> @llvm.fabs.nxv2f64(<vscale x 2 x double> [[TMP17:%.*]])
 ; SVE:    ret void
 ;
   entry:
@@ -422,9 +413,8 @@ define void @llvm_fabs_f32(float* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_fabs_f32
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @llvm.fabs.nxv4f32(<vscale x 4 x float> [[TMP11:%.*]])
-; SVE:    [[CALL:%.*]] = tail call float @llvm.fabs.f32(float [[CONV:%.*]])
+; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP18:%.*]] = call <vscale x 4 x float> @llvm.fabs.nxv4f32(<vscale x 4 x float> [[TMP17:%.*]])
 ; SVE:    ret void
 ;
   entry:
@@ -455,9 +445,8 @@ define void @llvm_floor_f64(double* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_floor_f64
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @llvm.floor.nxv2f64(<vscale x 2 x double> [[TMP11:%.*]])
-; SVE:    [[CALL:%.*]] = tail call double @llvm.floor.f64(double [[CONV:%.*]])
+; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP18:%.*]] = call <vscale x 2 x double> @llvm.floor.nxv2f64(<vscale x 2 x double> [[TMP17:%.*]])
 ; SVE:    ret void
 ;
   entry:
@@ -485,9 +474,8 @@ define void @llvm_floor_f32(float* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_floor_f32
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @llvm.floor.nxv4f32(<vscale x 4 x float> [[TMP11:%.*]])
-; SVE:    [[CALL:%.*]] = tail call float @llvm.floor.f32(float [[CONV:%.*]])
+; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP18:%.*]] = call <vscale x 4 x float> @llvm.floor.nxv4f32(<vscale x 4 x float> [[TMP17:%.*]])
 ; SVE:    ret void
 ;
   entry:
@@ -518,9 +506,8 @@ define void @llvm_fma_f64(double* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_fma_f64
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @llvm.fma.nxv2f64(<vscale x 2 x double> [[TMP11:%.*]], <vscale x 2 x double> [[TMP11]], <vscale x 2 x double> [[TMP11]])
-; SVE:    [[CALL:%.*]] = tail call double @llvm.fma.f64(double [[CONV:%.*]], double [[CONV]], double [[CONV]])
+; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP18:%.*]] = call <vscale x 2 x double> @llvm.fma.nxv2f64(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x double> [[TMP17]], <vscale x 2 x double> [[TMP17]])
 ; SVE:    ret void
 ;
   entry:
@@ -548,9 +535,8 @@ define void @llvm_fma_f32(float* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_fma_f32
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @llvm.fma.nxv4f32(<vscale x 4 x float> [[TMP11:%.*]], <vscale x 4 x float> [[TMP11]], <vscale x 4 x float> [[TMP11]])
-; SVE:    [[CALL:%.*]] = tail call float @llvm.fma.f32(float [[CONV:%.*]], float [[CONV]], float [[CONV]])
+; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP18:%.*]] = call <vscale x 4 x float> @llvm.fma.nxv4f32(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x float> [[TMP17]], <vscale x 4 x float> [[TMP17]])
 ; SVE:    ret void
 ;
   entry:
@@ -581,9 +567,8 @@ define void @llvm_log_f64(double* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_log_f64
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log(<vscale x 2 x double> [[TMP11:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
-; SVE:    [[CALL:%.*]] = tail call double @llvm.log.f64(double [[CONV:%.*]]) #[[ATTR12:[0-9]+]]
+; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP18:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
 ; SVE:    ret void
 ;
   entry:
@@ -611,9 +596,8 @@ define void @llvm_log_f32(float* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_log_f32
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_logf(<vscale x 4 x float> [[TMP11:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
-; SVE:    [[CALL:%.*]] = tail call float @llvm.log.f32(float [[CONV:%.*]]) #[[ATTR13:[0-9]+]]
+; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP18:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_logf(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
 ; SVE:    ret void
 ;
   entry:
@@ -644,9 +628,8 @@ define void @llvm_log10_f64(double* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_log10_f64
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log10(<vscale x 2 x double> [[TMP11:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
-; SVE:    [[CALL:%.*]] = tail call double @llvm.log10.f64(double [[CONV:%.*]]) #[[ATTR14:[0-9]+]]
+; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP18:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log10(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
 ; SVE:    ret void
 ;
   entry:
@@ -674,9 +657,8 @@ define void @llvm_log10_f32(float* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_log10_f32
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_log10f(<vscale x 4 x float> [[TMP11:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
-; SVE:    [[CALL:%.*]] = tail call float @llvm.log10.f32(float [[CONV:%.*]]) #[[ATTR15:[0-9]+]]
+; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP18:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_log10f(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
 ; SVE:    ret void
 ;
   entry:
@@ -707,9 +689,8 @@ define void @llvm_log2_f64(double* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_log2_f64
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log2(<vscale x 2 x double> [[TMP11:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
-; SVE:    [[CALL:%.*]] = tail call double @llvm.log2.f64(double [[CONV:%.*]]) #[[ATTR16:[0-9]+]]
+; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP18:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log2(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
 ; SVE:    ret void
 ;
   entry:
@@ -737,9 +718,8 @@ define void @llvm_log2_f32(float* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_log2_f32
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_log2f(<vscale x 4 x float> [[TMP11:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
-; SVE:    [[CALL:%.*]] = tail call float @llvm.log2.f32(float [[CONV:%.*]]) #[[ATTR17:[0-9]+]]
+; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP18:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_log2f(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
 ; SVE:    ret void
 ;
   entry:
@@ -770,9 +750,8 @@ define void @llvm_maxnum_f64(double* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_maxnum_f64
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @llvm.maxnum.nxv2f64(<vscale x 2 x double> [[TMP11:%.*]], <vscale x 2 x double> [[TMP11]])
-; SVE:    [[CALL:%.*]] = tail call double @llvm.maxnum.f64(double [[CONV:%.*]], double [[CONV]])
+; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP18:%.*]] = call <vscale x 2 x double> @llvm.maxnum.nxv2f64(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x double> [[TMP17]])
 ; SVE:    ret void
 ;
   entry:
@@ -800,9 +779,8 @@ define void @llvm_maxnum_f32(float* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_maxnum_f32
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @llvm.maxnum.nxv4f32(<vscale x 4 x float> [[TMP11:%.*]], <vscale x 4 x float> [[TMP11]])
-; SVE:    [[CALL:%.*]] = tail call float @llvm.maxnum.f32(float [[CONV:%.*]], float [[CONV]])
+; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP18:%.*]] = call <vscale x 4 x float> @llvm.maxnum.nxv4f32(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x float> [[TMP17]])
 ; SVE:    ret void
 ;
   entry:
@@ -833,9 +811,8 @@ define void @llvm_minnum_f64(double* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_minnum_f64
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @llvm.minnum.nxv2f64(<vscale x 2 x double> [[TMP11:%.*]], <vscale x 2 x double> [[TMP11]])
-; SVE:    [[CALL:%.*]] = tail call double @llvm.minnum.f64(double [[CONV:%.*]], double [[CONV]])
+; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP18:%.*]] = call <vscale x 2 x double> @llvm.minnum.nxv2f64(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x double> [[TMP17]])
 ; SVE:    ret void
 ;
   entry:
@@ -863,9 +840,8 @@ define void @llvm_minnum_f32(float* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_minnum_f32
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @llvm.minnum.nxv4f32(<vscale x 4 x float> [[TMP11:%.*]], <vscale x 4 x float> [[TMP11]])
-; SVE:    [[CALL:%.*]] = tail call float @llvm.minnum.f32(float [[CONV:%.*]], float [[CONV]])
+; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP18:%.*]] = call <vscale x 4 x float> @llvm.minnum.nxv4f32(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x float> [[TMP17]])
 ; SVE:    ret void
 ;
   entry:
@@ -896,9 +872,8 @@ define void @llvm_nearbyint_f64(double* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_nearbyint_f64
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @llvm.nearbyint.nxv2f64(<vscale x 2 x double> [[TMP11:%.*]])
-; SVE:    [[CALL:%.*]] = tail call double @llvm.nearbyint.f64(double [[CONV:%.*]])
+; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP18:%.*]] = call <vscale x 2 x double> @llvm.nearbyint.nxv2f64(<vscale x 2 x double> [[TMP17:%.*]])
 ; SVE:    ret void
 ;
   entry:
@@ -926,9 +901,8 @@ define void @llvm_nearbyint_f32(float* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_nearbyint_f32
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @llvm.nearbyint.nxv4f32(<vscale x 4 x float> [[TMP11:%.*]])
-; SVE:    [[CALL:%.*]] = tail call float @llvm.nearbyint.f32(float [[CONV:%.*]])
+; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP18:%.*]] = call <vscale x 4 x float> @llvm.nearbyint.nxv4f32(<vscale x 4 x float> [[TMP17:%.*]])
 ; SVE:    ret void
 ;
   entry:
@@ -959,9 +933,8 @@ define void @llvm_pow_f64(double* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_pow_f64
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxvv_pow(<vscale x 2 x double> [[TMP11:%.*]], <vscale x 2 x double> [[TMP11]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
-; SVE:    [[CALL:%.*]] = tail call double @llvm.pow.f64(double [[CONV:%.*]], double [[CONV]]) #[[ATTR18:[0-9]+]]
+; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP18:%.*]] = call <vscale x 2 x double> @_ZGVsMxvv_pow(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x double> [[TMP17]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
 ; SVE:    ret void
 ;
   entry:
@@ -989,9 +962,8 @@ define void @llvm_pow_f32(float* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_pow_f32
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxvv_powf(<vscale x 4 x float> [[TMP11:%.*]], <vscale x 4 x float> [[TMP11]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
-; SVE:    [[CALL:%.*]] = tail call float @llvm.pow.f32(float [[CONV:%.*]], float [[CONV]]) #[[ATTR19:[0-9]+]]
+; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP18:%.*]] = call <vscale x 4 x float> @_ZGVsMxvv_powf(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x float> [[TMP17]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
 ; SVE:    ret void
 ;
   entry:
@@ -1022,9 +994,8 @@ define void @llvm_rint_f64(double* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_rint_f64
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @llvm.rint.nxv2f64(<vscale x 2 x double> [[TMP11:%.*]])
-; SVE:    [[CALL:%.*]] = tail call double @llvm.rint.f64(double [[CONV:%.*]])
+; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP18:%.*]] = call <vscale x 2 x double> @llvm.rint.nxv2f64(<vscale x 2 x double> [[TMP17:%.*]])
 ; SVE:    ret void
 ;
   entry:
@@ -1052,9 +1023,8 @@ define void @llvm_rint_f32(float* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_rint_f32
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @llvm.rint.nxv4f32(<vscale x 4 x float> [[TMP11:%.*]])
-; SVE:    [[CALL:%.*]] = tail call float @llvm.rint.f32(float [[CONV:%.*]])
+; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP18:%.*]] = call <vscale x 4 x float> @llvm.rint.nxv4f32(<vscale x 4 x float> [[TMP17:%.*]])
 ; SVE:    ret void
 ;
   entry:
@@ -1085,9 +1055,8 @@ define void @llvm_round_f64(double* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_round_f64
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @llvm.round.nxv2f64(<vscale x 2 x double> [[TMP11:%.*]])
-; SVE:    [[CALL:%.*]] = tail call double @llvm.round.f64(double [[CONV:%.*]])
+; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP18:%.*]] = call <vscale x 2 x double> @llvm.round.nxv2f64(<vscale x 2 x double> [[TMP17:%.*]])
 ; SVE:    ret void
 ;
   entry:
@@ -1115,9 +1084,8 @@ define void @llvm_round_f32(float* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_round_f32
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @llvm.round.nxv4f32(<vscale x 4 x float> [[TMP11:%.*]])
-; SVE:    [[CALL:%.*]] = tail call float @llvm.round.f32(float [[CONV:%.*]])
+; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP18:%.*]] = call <vscale x 4 x float> @llvm.round.nxv4f32(<vscale x 4 x float> [[TMP17:%.*]])
 ; SVE:    ret void
 ;
   entry:
@@ -1148,9 +1116,8 @@ define void @llvm_sin_f64(double* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_sin_f64
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_sin(<vscale x 2 x double> [[TMP11:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
-; SVE:    [[CALL:%.*]] = tail call double @llvm.sin.f64(double [[CONV:%.*]]) #[[ATTR20:[0-9]+]]
+; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP18:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_sin(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
 ; SVE:    ret void
 ;
   entry:
@@ -1178,9 +1145,8 @@ define void @llvm_sin_f32(float* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_sin_f32
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_sinf(<vscale x 4 x float> [[TMP11:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
-; SVE:    [[CALL:%.*]] = tail call float @llvm.sin.f32(float [[CONV:%.*]]) #[[ATTR21:[0-9]+]]
+; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP18:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_sinf(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
 ; SVE:    ret void
 ;
   entry:
@@ -1211,9 +1177,8 @@ define void @llvm_sqrt_f64(double* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_sqrt_f64
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @llvm.sqrt.nxv2f64(<vscale x 2 x double> [[TMP11:%.*]])
-; SVE:    [[CALL:%.*]] = tail call double @llvm.sqrt.f64(double [[CONV:%.*]])
+; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP18:%.*]] = call <vscale x 2 x double> @llvm.sqrt.nxv2f64(<vscale x 2 x double> [[TMP17:%.*]])
 ; SVE:    ret void
 ;
   entry:
@@ -1241,9 +1206,8 @@ define void @llvm_sqrt_f32(float* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_sqrt_f32
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @llvm.sqrt.nxv4f32(<vscale x 4 x float> [[TMP11:%.*]])
-; SVE:    [[CALL:%.*]] = tail call float @llvm.sqrt.f32(float [[CONV:%.*]])
+; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP18:%.*]] = call <vscale x 4 x float> @llvm.sqrt.nxv4f32(<vscale x 4 x float> [[TMP17:%.*]])
 ; SVE:    ret void
 ;
   entry:
@@ -1274,9 +1238,8 @@ define void @llvm_trunc_f64(double* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_trunc_f64
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @llvm.trunc.nxv2f64(<vscale x 2 x double> [[TMP11:%.*]])
-; SVE:    [[CALL:%.*]] = tail call double @llvm.trunc.f64(double [[CONV:%.*]])
+; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP18:%.*]] = call <vscale x 2 x double> @llvm.trunc.nxv2f64(<vscale x 2 x double> [[TMP17:%.*]])
 ; SVE:    ret void
 ;
   entry:
@@ -1304,9 +1267,8 @@ define void @llvm_trunc_f32(float* %varray) {
 ; NEON:    ret void
 ;
 ; SVE-LABEL: define void @llvm_trunc_f32
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @llvm.trunc.nxv4f32(<vscale x 4 x float> [[TMP11:%.*]])
-; SVE:    [[CALL:%.*]] = tail call float @llvm.trunc.f32(float [[CONV:%.*]])
+; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
+; SVE:    [[TMP18:%.*]] = call <vscale x 4 x float> @llvm.trunc.nxv4f32(<vscale x 4 x float> [[TMP17:%.*]])
 ; SVE:    ret void
 ;
   entry:

>From 83719090324493b78537e74715413bdf058a9eaf Mon Sep 17 00:00:00 2001
From: Paschalis Mpeis <Paschalis.Mpeis at arm.com>
Date: Mon, 27 Nov 2023 16:55:17 +0000
Subject: [PATCH 08/10] [TLI] Pass replace-with-veclib works with Scalable
 Vectors.

The pass uses the masked variant of the TLI method when the intrinsic
operates on scalable vectors and no non-masked variant can be found.
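
Condensed sketch of the lookup this patch adds to ReplaceWithVeclib.cpp (see
the full diff below; it assumes the three-argument TLI signatures used there):

```cpp
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Support/TypeSize.h"

using namespace llvm;

// A call is treated as masked only when a scalable vector operand was seen
// (so masking is possible) AND the TLI has no non-masked mapping for it.
static StringRef lookupVeclibName(const TargetLibraryInfo &TLI,
                                  StringRef ScalarName, ElementCount VF,
                                  bool MayBeMasked) {
  const bool IsMasked = MayBeMasked &&
                        !TLI.getVectorMappingInfo(ScalarName, VF,
                                                  /*Masked=*/false);
  return TLI.getVectorizedFunction(ScalarName, VF, IsMasked);
}
```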
---
 llvm/lib/Analysis/VFABIDemangling.cpp         |  2 +-
 llvm/lib/CodeGen/ReplaceWithVeclib.cpp        | 24 ++++++-------
 .../replace-intrinsics-with-veclib-armpl.ll   | 36 +++++++++----------
 ...e-intrinsics-with-veclib-sleef-scalable.ll | 35 +++++++++---------
 4 files changed, 50 insertions(+), 47 deletions(-)

diff --git a/llvm/lib/Analysis/VFABIDemangling.cpp b/llvm/lib/Analysis/VFABIDemangling.cpp
index fc94a33851963c..f51bfbb7ece4cf 100644
--- a/llvm/lib/Analysis/VFABIDemangling.cpp
+++ b/llvm/lib/Analysis/VFABIDemangling.cpp
@@ -126,7 +126,7 @@ static ParseRet tryParseLinearTokenWithRuntimeStep(StringRef &ParseString,
   return ParseRet::None;
 }
 
-/// The function looks for the following stringt at the beginning of
+/// The function looks for the following string at the beginning of
 /// the input string `ParseString`:
 ///
 ///  <token> <number>
diff --git a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp
index 36c91b7fa97e46..d31a793556dfde 100644
--- a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp
+++ b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp
@@ -105,6 +105,7 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI,
   // all vector operands have identical vector width.
   ElementCount VF = ElementCount::getFixed(0);
   SmallVector<Type *> ScalarTypes;
+  bool MayBeMasked = false;
   for (auto Arg : enumerate(CI.args())) {
     auto *ArgType = Arg.value()->getType();
     // Vector calls to intrinsics can still have
@@ -121,17 +122,13 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI,
         return false;
       }
       ElementCount NumElements = VectorArgTy->getElementCount();
-      if (NumElements.isScalable()) {
-        // The current implementation does not support
-        // scalable vectors.
-        return false;
-      }
-      if (VF.isNonZero() && VF != NumElements) {
-        // The different arguments differ in vector size.
+      if (NumElements.isScalable())
+        MayBeMasked = true;
+
+      // The different arguments differ in vector size.
+      if (VF.isNonZero() && VF != NumElements)
         return false;
-      } else {
-        VF = NumElements;
-      }
+      VF = NumElements;
       ScalarTypes.push_back(VectorArgTy->getElementType());
     }
   }
@@ -152,11 +149,14 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI,
     return false;
   }
 
+  // Assume the call has a mask when masking is possible and there is no
+  // mapping for a non-masked variant.
+  const bool IsMasked =
+      MayBeMasked && !TLI.getVectorMappingInfo(ScalarName, VF, false);
   // Try to find the mapping for the scalar version of this intrinsic
   // and the exact vector width of the call operands in the
   // TargetLibraryInfo.
-  StringRef TLIName = TLI.getVectorizedFunction(ScalarName, VF);
-
+  StringRef TLIName = TLI.getVectorizedFunction(ScalarName, VF, IsMasked);
   LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Looking up TLI mapping for `"
                     << ScalarName << "` and vector width " << VF << ".\n");
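
For reference (not part of the hunk above): when a masked mapping is selected,
the replacement call ultimately needs an all-active predicate operand, which
constant-folds to the `insertelement`/`shufflevector` splat-of-`i1 true`
pattern seen in the SLEEF test updates earlier in this series. A hedged sketch
of how such a mask can be materialized with `IRBuilder` (the helper name is
illustrative):

```cpp
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Builds an all-active predicate, e.g. <vscale x 2 x i1> splat (true).
static Value *createAllTrueMask(IRBuilder<> &Builder, ElementCount VF) {
  return Builder.CreateVectorSplat(VF, Builder.getTrue());
}
```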
 
diff --git a/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-armpl.ll b/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-armpl.ll
index 18431ae021f976..633cb220f52464 100644
--- a/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-armpl.ll
+++ b/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-armpl.ll
@@ -15,7 +15,7 @@ declare <vscale x 2 x double> @llvm.cos.nxv2f64(<vscale x 2 x double>)
 declare <vscale x 4 x float> @llvm.cos.nxv4f32(<vscale x 4 x float>)
 
 ;.
-; CHECK: @llvm.compiler.used = appending global [16 x ptr] [ptr @armpl_vcosq_f64, ptr @armpl_vcosq_f32, ptr @armpl_vsinq_f64, ptr @armpl_vsinq_f32, ptr @armpl_vexpq_f64, ptr @armpl_vexpq_f32, ptr @armpl_vexp2q_f64, ptr @armpl_vexp2q_f32, ptr @armpl_vexp10q_f64, ptr @armpl_vexp10q_f32, ptr @armpl_vlogq_f64, ptr @armpl_vlogq_f32, ptr @armpl_vlog2q_f64, ptr @armpl_vlog2q_f32, ptr @armpl_vlog10q_f64, ptr @armpl_vlog10q_f32], section "llvm.metadata"
+; CHECK: @llvm.compiler.used = appending global [32 x ptr] [ptr @armpl_vcosq_f64, ptr @armpl_vcosq_f32, ptr @armpl_svcos_f64_x, ptr @armpl_svcos_f32_x, ptr @armpl_vsinq_f64, ptr @armpl_vsinq_f32, ptr @armpl_svsin_f64_x, ptr @armpl_svsin_f32_x, ptr @armpl_vexpq_f64, ptr @armpl_vexpq_f32, ptr @armpl_svexp_f64_x, ptr @armpl_svexp_f32_x, ptr @armpl_vexp2q_f64, ptr @armpl_vexp2q_f32, ptr @armpl_svexp2_f64_x, ptr @armpl_svexp2_f32_x, ptr @armpl_vexp10q_f64, ptr @armpl_vexp10q_f32, ptr @armpl_svexp10_f64_x, ptr @armpl_svexp10_f32_x, ptr @armpl_vlogq_f64, ptr @armpl_vlogq_f32, ptr @armpl_svlog_f64_x, ptr @armpl_svlog_f32_x, ptr @armpl_vlog2q_f64, ptr @armpl_vlog2q_f32, ptr @armpl_svlog2_f64_x, ptr @armpl_svlog2_f32_x, ptr @armpl_vlog10q_f64, ptr @armpl_vlog10q_f32, ptr @armpl_svlog10_f64_x, ptr @armpl_svlog10_f32_x], section "llvm.metadata"
 ;.
 define <2 x double> @llvm_cos_f64(<2 x double> %in) {
 ; CHECK-LABEL: define <2 x double> @llvm_cos_f64
@@ -40,7 +40,7 @@ define <4 x float> @llvm_cos_f32(<4 x float> %in) {
 define <vscale x 2 x double> @llvm_cos_vscale_f64(<vscale x 2 x double> %in) #0 {
 ; CHECK-LABEL: define <vscale x 2 x double> @llvm_cos_vscale_f64
 ; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1:[0-9]+]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.cos.nxv2f64(<vscale x 2 x double> [[IN]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @armpl_svcos_f64_x(<vscale x 2 x double> [[IN]])
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = call fast <vscale x 2 x double> @llvm.cos.nxv2f64(<vscale x 2 x double> %in)
@@ -50,7 +50,7 @@ define <vscale x 2 x double> @llvm_cos_vscale_f64(<vscale x 2 x double> %in) #0
 define <vscale x 4 x float> @llvm_cos_vscale_f32(<vscale x 4 x float> %in) #0 {
 ; CHECK-LABEL: define <vscale x 4 x float> @llvm_cos_vscale_f32
 ; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.cos.nxv4f32(<vscale x 4 x float> [[IN]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @armpl_svcos_f32_x(<vscale x 4 x float> [[IN]])
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 ;
   %1 = call fast <vscale x 4 x float> @llvm.cos.nxv4f32(<vscale x 4 x float> %in)
@@ -85,7 +85,7 @@ define <4 x float> @llvm_sin_f32(<4 x float> %in) {
 define <vscale x 2 x double> @llvm_sin_vscale_f64(<vscale x 2 x double> %in) #0 {
 ; CHECK-LABEL: define <vscale x 2 x double> @llvm_sin_vscale_f64
 ; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.sin.nxv2f64(<vscale x 2 x double> [[IN]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @armpl_svsin_f64_x(<vscale x 2 x double> [[IN]])
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = call fast <vscale x 2 x double> @llvm.sin.nxv2f64(<vscale x 2 x double> %in)
@@ -95,7 +95,7 @@ define <vscale x 2 x double> @llvm_sin_vscale_f64(<vscale x 2 x double> %in) #0
 define <vscale x 4 x float> @llvm_sin_vscale_f32(<vscale x 4 x float> %in) #0 {
 ; CHECK-LABEL: define <vscale x 4 x float> @llvm_sin_vscale_f32
 ; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.sin.nxv4f32(<vscale x 4 x float> [[IN]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @armpl_svsin_f32_x(<vscale x 4 x float> [[IN]])
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 ;
   %1 = call fast <vscale x 4 x float> @llvm.sin.nxv4f32(<vscale x 4 x float> %in)
@@ -130,7 +130,7 @@ define <4 x float> @llvm_exp_f32(<4 x float> %in) {
 define <vscale x 2 x double> @llvm_exp_vscale_f64(<vscale x 2 x double> %in) #0 {
 ; CHECK-LABEL: define <vscale x 2 x double> @llvm_exp_vscale_f64
 ; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.exp.nxv2f64(<vscale x 2 x double> [[IN]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @armpl_svexp_f64_x(<vscale x 2 x double> [[IN]])
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = call fast <vscale x 2 x double> @llvm.exp.nxv2f64(<vscale x 2 x double> %in)
@@ -140,7 +140,7 @@ define <vscale x 2 x double> @llvm_exp_vscale_f64(<vscale x 2 x double> %in) #0
 define <vscale x 4 x float> @llvm_exp_vscale_f32(<vscale x 4 x float> %in) #0 {
 ; CHECK-LABEL: define <vscale x 4 x float> @llvm_exp_vscale_f32
 ; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float> [[IN]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @armpl_svexp_f32_x(<vscale x 4 x float> [[IN]])
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 ;
   %1 = call fast <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float> %in)
@@ -175,7 +175,7 @@ define <4 x float> @llvm_exp2_f32(<4 x float> %in) {
 define <vscale x 2 x double> @llvm_exp2_vscale_f64(<vscale x 2 x double> %in) #0 {
 ; CHECK-LABEL: define <vscale x 2 x double> @llvm_exp2_vscale_f64
 ; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.exp2.nxv2f64(<vscale x 2 x double> [[IN]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @armpl_svexp2_f64_x(<vscale x 2 x double> [[IN]])
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = call fast <vscale x 2 x double> @llvm.exp2.nxv2f64(<vscale x 2 x double> %in)
@@ -185,7 +185,7 @@ define <vscale x 2 x double> @llvm_exp2_vscale_f64(<vscale x 2 x double> %in) #0
 define <vscale x 4 x float> @llvm_exp2_vscale_f32(<vscale x 4 x float> %in) #0 {
 ; CHECK-LABEL: define <vscale x 4 x float> @llvm_exp2_vscale_f32
 ; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.exp2.nxv4f32(<vscale x 4 x float> [[IN]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @armpl_svexp2_f32_x(<vscale x 4 x float> [[IN]])
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 ;
   %1 = call fast <vscale x 4 x float> @llvm.exp2.nxv4f32(<vscale x 4 x float> %in)
@@ -220,7 +220,7 @@ define <4 x float> @llvm_exp10_f32(<4 x float> %in) {
 define <vscale x 2 x double> @llvm_exp10_vscale_f64(<vscale x 2 x double> %in) #0 {
 ; CHECK-LABEL: define <vscale x 2 x double> @llvm_exp10_vscale_f64
 ; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.exp10.nxv2f64(<vscale x 2 x double> [[IN]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @armpl_svexp10_f64_x(<vscale x 2 x double> [[IN]])
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = call fast <vscale x 2 x double> @llvm.exp10.nxv2f64(<vscale x 2 x double> %in)
@@ -230,7 +230,7 @@ define <vscale x 2 x double> @llvm_exp10_vscale_f64(<vscale x 2 x double> %in) #
 define <vscale x 4 x float> @llvm_exp10_vscale_f32(<vscale x 4 x float> %in) #0 {
 ; CHECK-LABEL: define <vscale x 4 x float> @llvm_exp10_vscale_f32
 ; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.exp10.nxv4f32(<vscale x 4 x float> [[IN]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @armpl_svexp10_f32_x(<vscale x 4 x float> [[IN]])
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 ;
   %1 = call fast <vscale x 4 x float> @llvm.exp10.nxv4f32(<vscale x 4 x float> %in)
@@ -265,7 +265,7 @@ define <4 x float> @llvm_log_f32(<4 x float> %in) {
 define <vscale x 2 x double> @llvm_log_vscale_f64(<vscale x 2 x double> %in) #0 {
 ; CHECK-LABEL: define <vscale x 2 x double> @llvm_log_vscale_f64
 ; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.log.nxv2f64(<vscale x 2 x double> [[IN]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @armpl_svlog_f64_x(<vscale x 2 x double> [[IN]])
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = call fast <vscale x 2 x double> @llvm.log.nxv2f64(<vscale x 2 x double> %in)
@@ -275,7 +275,7 @@ define <vscale x 2 x double> @llvm_log_vscale_f64(<vscale x 2 x double> %in) #0
 define <vscale x 4 x float> @llvm_log_vscale_f32(<vscale x 4 x float> %in) #0 {
 ; CHECK-LABEL: define <vscale x 4 x float> @llvm_log_vscale_f32
 ; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.log.nxv4f32(<vscale x 4 x float> [[IN]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @armpl_svlog_f32_x(<vscale x 4 x float> [[IN]])
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 ;
   %1 = call fast <vscale x 4 x float> @llvm.log.nxv4f32(<vscale x 4 x float> %in)
@@ -310,7 +310,7 @@ define <4 x float> @llvm_log2_f32(<4 x float> %in) {
 define <vscale x 2 x double> @llvm_log2_vscale_f64(<vscale x 2 x double> %in) #0 {
 ; CHECK-LABEL: define <vscale x 2 x double> @llvm_log2_vscale_f64
 ; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.log2.nxv2f64(<vscale x 2 x double> [[IN]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @armpl_svlog2_f64_x(<vscale x 2 x double> [[IN]])
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = call fast <vscale x 2 x double> @llvm.log2.nxv2f64(<vscale x 2 x double> %in)
@@ -320,7 +320,7 @@ define <vscale x 2 x double> @llvm_log2_vscale_f64(<vscale x 2 x double> %in) #0
 define <vscale x 4 x float> @llvm_log2_vscale_f32(<vscale x 4 x float> %in) #0 {
 ; CHECK-LABEL: define <vscale x 4 x float> @llvm_log2_vscale_f32
 ; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.log2.nxv4f32(<vscale x 4 x float> [[IN]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @armpl_svlog2_f32_x(<vscale x 4 x float> [[IN]])
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 ;
   %1 = call fast <vscale x 4 x float> @llvm.log2.nxv4f32(<vscale x 4 x float> %in)
@@ -355,7 +355,7 @@ define <4 x float> @llvm_log10_f32(<4 x float> %in) {
 define <vscale x 2 x double> @llvm_log10_vscale_f64(<vscale x 2 x double> %in) #0 {
 ; CHECK-LABEL: define <vscale x 2 x double> @llvm_log10_vscale_f64
 ; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.log10.nxv2f64(<vscale x 2 x double> [[IN]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @armpl_svlog10_f64_x(<vscale x 2 x double> [[IN]])
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = call fast <vscale x 2 x double> @llvm.log10.nxv2f64(<vscale x 2 x double> %in)
@@ -365,7 +365,7 @@ define <vscale x 2 x double> @llvm_log10_vscale_f64(<vscale x 2 x double> %in) #
 define <vscale x 4 x float> @llvm_log10_vscale_f32(<vscale x 4 x float> %in) #0 {
 ; CHECK-LABEL: define <vscale x 4 x float> @llvm_log10_vscale_f32
 ; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.log10.nxv4f32(<vscale x 4 x float> [[IN]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @armpl_svlog10_f32_x(<vscale x 4 x float> [[IN]])
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 ;
   %1 = call fast <vscale x 4 x float> @llvm.log10.nxv4f32(<vscale x 4 x float> %in)
@@ -380,7 +380,7 @@ declare <vscale x 4 x float> @llvm.pow.nxv4f32(<vscale x 4 x float>, <vscale x 4
 ;
 ; There is a bug in the replace-with-veclib pass: for intrinsics that take
 ; more than one argument but have just one overloaded type, it incorrectly
-; reconstructs the scalar name, for pow specificlly it is searching for:
+; reconstructs the scalar name; for pow specifically it searches for:
 ; llvm.pow.f64.f64 and llvm.pow.f32.f32
 ;
 
diff --git a/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-sleef-scalable.ll b/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-sleef-scalable.ll
index 8b06c41bcb1a6d..969945590a0a18 100644
--- a/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-sleef-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-sleef-scalable.ll
@@ -5,6 +5,9 @@ target triple = "aarch64-unknown-linux-gnu"
 
 ; NOTE: The existing TLI mappings are not used since the -replace-with-veclib pass is broken for scalable vectors.
 
+;.
+; CHECK: @llvm.compiler.used = appending global [16 x ptr] [ptr @_ZGVsMxv_cos, ptr @_ZGVsMxv_cosf, ptr @_ZGVsMxv_exp, ptr @_ZGVsMxv_expf, ptr @_ZGVsMxv_exp2, ptr @_ZGVsMxv_exp2f, ptr @_ZGVsMxv_exp10, ptr @_ZGVsMxv_exp10f, ptr @_ZGVsMxv_log, ptr @_ZGVsMxv_logf, ptr @_ZGVsMxv_log10, ptr @_ZGVsMxv_log10f, ptr @_ZGVsMxv_log2, ptr @_ZGVsMxv_log2f, ptr @_ZGVsMxv_sin, ptr @_ZGVsMxv_sinf], section "llvm.metadata"
+;.
 define <vscale x 2 x double> @llvm_ceil_vscale_f64(<vscale x 2 x double> %in) {
 ; CHECK-LABEL: @llvm_ceil_vscale_f64(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.ceil.nxv2f64(<vscale x 2 x double> [[IN:%.*]])
@@ -43,7 +46,7 @@ define <vscale x 4 x float> @llvm_copysign_vscale_f32(<vscale x 4 x float> %mag,
 
 define <vscale x 2 x double> @llvm_cos_vscale_f64(<vscale x 2 x double> %in) {
 ; CHECK-LABEL: @llvm_cos_vscale_f64(
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.cos.nxv2f64(<vscale x 2 x double> [[IN:%.*]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_cos(<vscale x 2 x double> [[IN:%.*]])
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = call fast <vscale x 2 x double> @llvm.cos.nxv2f64(<vscale x 2 x double> %in)
@@ -52,7 +55,7 @@ define <vscale x 2 x double> @llvm_cos_vscale_f64(<vscale x 2 x double> %in) {
 
 define <vscale x 4 x float> @llvm_cos_vscale_f32(<vscale x 4 x float> %in) {
 ; CHECK-LABEL: @llvm_cos_vscale_f32(
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.cos.nxv4f32(<vscale x 4 x float> [[IN:%.*]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_cosf(<vscale x 4 x float> [[IN:%.*]])
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 ;
   %1 = call fast <vscale x 4 x float> @llvm.cos.nxv4f32(<vscale x 4 x float> %in)
@@ -61,7 +64,7 @@ define <vscale x 4 x float> @llvm_cos_vscale_f32(<vscale x 4 x float> %in) {
 
 define <vscale x 2 x double> @llvm_exp_vscale_f64(<vscale x 2 x double> %in) {
 ; CHECK-LABEL: @llvm_exp_vscale_f64(
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.exp.nxv2f64(<vscale x 2 x double> [[IN:%.*]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_exp(<vscale x 2 x double> [[IN:%.*]])
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = call fast <vscale x 2 x double> @llvm.exp.nxv2f64(<vscale x 2 x double> %in)
@@ -70,7 +73,7 @@ define <vscale x 2 x double> @llvm_exp_vscale_f64(<vscale x 2 x double> %in) {
 
 define <vscale x 4 x float> @llvm_exp_vscale_f32(<vscale x 4 x float> %in) {
 ; CHECK-LABEL: @llvm_exp_vscale_f32(
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float> [[IN:%.*]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_expf(<vscale x 4 x float> [[IN:%.*]])
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 ;
   %1 = call fast <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float> %in)
@@ -79,7 +82,7 @@ define <vscale x 4 x float> @llvm_exp_vscale_f32(<vscale x 4 x float> %in) {
 
 define <vscale x 2 x double> @llvm_exp2_vscale_f64(<vscale x 2 x double> %in) {
 ; CHECK-LABEL: @llvm_exp2_vscale_f64(
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.exp2.nxv2f64(<vscale x 2 x double> [[IN:%.*]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_exp2(<vscale x 2 x double> [[IN:%.*]])
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = call fast <vscale x 2 x double> @llvm.exp2.nxv2f64(<vscale x 2 x double> %in)
@@ -88,7 +91,7 @@ define <vscale x 2 x double> @llvm_exp2_vscale_f64(<vscale x 2 x double> %in) {
 
 define <vscale x 4 x float> @llvm_exp2_vscale_f32(<vscale x 4 x float> %in) {
 ; CHECK-LABEL: @llvm_exp2_vscale_f32(
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.exp2.nxv4f32(<vscale x 4 x float> [[IN:%.*]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_exp2f(<vscale x 4 x float> [[IN:%.*]])
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 ;
   %1 = call fast <vscale x 4 x float> @llvm.exp2.nxv4f32(<vscale x 4 x float> %in)
@@ -97,7 +100,7 @@ define <vscale x 4 x float> @llvm_exp2_vscale_f32(<vscale x 4 x float> %in) {
 
 define <vscale x 2 x double> @llvm_exp10_vscale_f64(<vscale x 2 x double> %in) {
 ; CHECK-LABEL: @llvm_exp10_vscale_f64(
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.exp10.nxv2f64(<vscale x 2 x double> [[IN:%.*]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_exp10(<vscale x 2 x double> [[IN:%.*]])
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = call fast <vscale x 2 x double> @llvm.exp10.nxv2f64(<vscale x 2 x double> %in)
@@ -106,7 +109,7 @@ define <vscale x 2 x double> @llvm_exp10_vscale_f64(<vscale x 2 x double> %in) {
 
 define <vscale x 4 x float> @llvm_exp10_vscale_f32(<vscale x 4 x float> %in) {
 ; CHECK-LABEL: @llvm_exp10_vscale_f32(
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.exp10.nxv4f32(<vscale x 4 x float> [[IN:%.*]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_exp10f(<vscale x 4 x float> [[IN:%.*]])
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 ;
   %1 = call fast <vscale x 4 x float> @llvm.exp10.nxv4f32(<vscale x 4 x float> %in)
@@ -169,7 +172,7 @@ define <vscale x 4 x float> @llvm_fma_vscale_f32(<vscale x 4 x float> %a, <vscal
 
 define <vscale x 2 x double> @llvm_log_vscale_f64(<vscale x 2 x double> %in) {
 ; CHECK-LABEL: @llvm_log_vscale_f64(
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.log.nxv2f64(<vscale x 2 x double> [[IN:%.*]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_log(<vscale x 2 x double> [[IN:%.*]])
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = call fast <vscale x 2 x double> @llvm.log.nxv2f64(<vscale x 2 x double> %in)
@@ -178,7 +181,7 @@ define <vscale x 2 x double> @llvm_log_vscale_f64(<vscale x 2 x double> %in) {
 
 define <vscale x 4 x float> @llvm_log_vscale_f32(<vscale x 4 x float> %in) {
 ; CHECK-LABEL: @llvm_log_vscale_f32(
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.log.nxv4f32(<vscale x 4 x float> [[IN:%.*]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_logf(<vscale x 4 x float> [[IN:%.*]])
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 ;
   %1 = call fast <vscale x 4 x float> @llvm.log.nxv4f32(<vscale x 4 x float> %in)
@@ -187,7 +190,7 @@ define <vscale x 4 x float> @llvm_log_vscale_f32(<vscale x 4 x float> %in) {
 
 define <vscale x 2 x double> @llvm_log10_vscale_f64(<vscale x 2 x double> %in) {
 ; CHECK-LABEL: @llvm_log10_vscale_f64(
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.log10.nxv2f64(<vscale x 2 x double> [[IN:%.*]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_log10(<vscale x 2 x double> [[IN:%.*]])
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = call fast <vscale x 2 x double> @llvm.log10.nxv2f64(<vscale x 2 x double> %in)
@@ -196,7 +199,7 @@ define <vscale x 2 x double> @llvm_log10_vscale_f64(<vscale x 2 x double> %in) {
 
 define <vscale x 4 x float> @llvm_log10_vscale_f32(<vscale x 4 x float> %in) {
 ; CHECK-LABEL: @llvm_log10_vscale_f32(
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.log10.nxv4f32(<vscale x 4 x float> [[IN:%.*]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_log10f(<vscale x 4 x float> [[IN:%.*]])
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 ;
   %1 = call fast <vscale x 4 x float> @llvm.log10.nxv4f32(<vscale x 4 x float> %in)
@@ -205,7 +208,7 @@ define <vscale x 4 x float> @llvm_log10_vscale_f32(<vscale x 4 x float> %in) {
 
 define <vscale x 2 x double> @llvm_log2_vscale_f64(<vscale x 2 x double> %in) {
 ; CHECK-LABEL: @llvm_log2_vscale_f64(
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.log2.nxv2f64(<vscale x 2 x double> [[IN:%.*]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_log2(<vscale x 2 x double> [[IN:%.*]])
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = call fast <vscale x 2 x double> @llvm.log2.nxv2f64(<vscale x 2 x double> %in)
@@ -214,7 +217,7 @@ define <vscale x 2 x double> @llvm_log2_vscale_f64(<vscale x 2 x double> %in) {
 
 define <vscale x 4 x float> @llvm_log2_vscale_f32(<vscale x 4 x float> %in) {
 ; CHECK-LABEL: @llvm_log2_vscale_f32(
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.log2.nxv4f32(<vscale x 4 x float> [[IN:%.*]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_log2f(<vscale x 4 x float> [[IN:%.*]])
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 ;
   %1 = call fast <vscale x 4 x float> @llvm.log2.nxv4f32(<vscale x 4 x float> %in)
@@ -331,7 +334,7 @@ define <vscale x 4 x float> @llvm_round_vscale_f32(<vscale x 4 x float> %in) {
 
 define <vscale x 2 x double> @llvm_sin_vscale_f64(<vscale x 2 x double> %in) {
 ; CHECK-LABEL: @llvm_sin_vscale_f64(
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.sin.nxv2f64(<vscale x 2 x double> [[IN:%.*]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_sin(<vscale x 2 x double> [[IN:%.*]])
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = call fast <vscale x 2 x double> @llvm.sin.nxv2f64(<vscale x 2 x double> %in)
@@ -340,7 +343,7 @@ define <vscale x 2 x double> @llvm_sin_vscale_f64(<vscale x 2 x double> %in) {
 
 define <vscale x 4 x float> @llvm_sin_vscale_f32(<vscale x 4 x float> %in) {
 ; CHECK-LABEL: @llvm_sin_vscale_f32(
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.sin.nxv4f32(<vscale x 4 x float> [[IN:%.*]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_sinf(<vscale x 4 x float> [[IN:%.*]])
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 ;
   %1 = call fast <vscale x 4 x float> @llvm.sin.nxv4f32(<vscale x 4 x float> %in)
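
The hunks above switch the expected SLEEF calls from the plain intrinsics to
their TLI-mapped names (e.g. llvm.sin.nxv2f64 becomes _ZGVsMxv_sin). A minimal
sketch of the lookup these tests exercise, assuming a TargetLibraryInfo
populated with the SLEEF mappings (illustrative only, not part of the patch):

  #include "llvm/Analysis/TargetLibraryInfo.h"
  #include "llvm/Support/TypeSize.h"
  using namespace llvm;

  // Resolve the veclib variant for a scalar intrinsic name at a scalable VF.
  static StringRef lookupVectorVariant(const TargetLibraryInfo &TLI,
                                       StringRef ScalarName, bool Masked) {
    ElementCount VF = ElementCount::getScalable(2); // nxv2f64 in the tests
    if (!TLI.isFunctionVectorizable(ScalarName))
      return "";
    // E.g. "llvm.sin.f64" maps to "_ZGVsMxv_sin" when Masked is true.
    return TLI.getVectorizedFunction(ScalarName, VF, Masked);
  }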

>From b24f0e595f21bbaa17b414c02d3641d9a6422e54 Mon Sep 17 00:00:00 2001
From: Paschalis Mpeis <Paschalis.Mpeis at arm.com>
Date: Tue, 12 Dec 2023 15:48:35 +0000
Subject: [PATCH 09/10] Use createFunctionType to correctly replace veclib
 calls.

Split replaceWithTLIFunction method into two methods.
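
A rough sketch of the intended flow after the split, assuming the demangled
VFInfo and the vectorized FunctionType are already available (simplified;
attribute copying, operand bundles, and error handling elided):

  #include "llvm/Analysis/VectorUtils.h"
  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/Module.h"
  #include "llvm/IR/Operator.h"
  using namespace llvm;

  static void sketchReplace(CallInst &CI, const VFInfo &Info,
                            FunctionType *VectorFTy, StringRef TLIName) {
    Module *M = CI.getModule();
    // Step 1 (getTLIFunction): get-or-create the veclib declaration, using
    // the vectorized FunctionType, which may differ from the intrinsic's.
    Function *TLIFunc = M->getFunction(TLIName);
    if (!TLIFunc)
      TLIFunc = Function::Create(VectorFTy, Function::ExternalLinkage,
                                 TLIName, *M);
    // Step 2 (replaceWithTLIFunction): rebuild the call, appending an
    // all-active mask when the veclib variant expects one.
    IRBuilder<> Builder(&CI);
    SmallVector<Value *> Args(CI.args());
    if (Args.size() < VectorFTy->getNumParams()) {
      auto *MaskTy =
          VectorType::get(Type::getInt1Ty(CI.getContext()), Info.Shape.VF);
      Args.push_back(Constant::getAllOnesValue(MaskTy));
    }
    CallInst *NewCI = Builder.CreateCall(TLIFunc, Args);
    if (isa<FPMathOperator>(NewCI))
      NewCI->copyFastMathFlags(&CI);
    CI.replaceAllUsesWith(NewCI);
  }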
---
 llvm/lib/CodeGen/ReplaceWithVeclib.cpp        | 203 ++++++++++--------
 .../replace-intrinsics-with-veclib-armpl.ll   |  32 +--
 ...e-intrinsics-with-veclib-sleef-scalable.ll |  32 +--
 3 files changed, 149 insertions(+), 118 deletions(-)

diff --git a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp
index d31a793556dfde..ddcc55a8e52c40 100644
--- a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp
+++ b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp
@@ -15,15 +15,19 @@
 #include "llvm/CodeGen/ReplaceWithVeclib.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/DemandedBits.h"
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstIterator.h"
+#include "llvm/Support/TypeSize.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
+#include <optional>
 
 using namespace llvm;
 
@@ -38,138 +42,166 @@ STATISTIC(NumTLIFuncDeclAdded,
 STATISTIC(NumFuncUsedAdded,
           "Number of functions added to `llvm.compiler.used`");
 
-static bool replaceWithTLIFunction(CallInst &CI, const StringRef TLIName) {
-  Module *M = CI.getModule();
-
-  Function *OldFunc = CI.getCalledFunction();
-
-  // Check if the vector library function is already declared in this module,
-  // otherwise insert it.
+/// Returns a vector Function that it adds to the Module \p M. When \p
+/// OptOldFunc is given, it copies its attributes to the newly created Function.
+Function *getTLIFunction(Module *M, FunctionType *VectorFTy,
+                         std::optional<Function *> OptOldFunc,
+                         const StringRef TLIName) {
   Function *TLIFunc = M->getFunction(TLIName);
   if (!TLIFunc) {
-    TLIFunc = Function::Create(OldFunc->getFunctionType(),
-                               Function::ExternalLinkage, TLIName, *M);
-    TLIFunc->copyAttributesFrom(OldFunc);
+    TLIFunc =
+        Function::Create(VectorFTy, Function::ExternalLinkage, TLIName, *M);
+    if (OptOldFunc)
+      TLIFunc->copyAttributesFrom(*OptOldFunc);
 
     LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Added vector library function `"
                       << TLIName << "` of type `" << *(TLIFunc->getType())
                       << "` to module.\n");
 
     ++NumTLIFuncDeclAdded;
-
-    // Add the freshly created function to llvm.compiler.used,
-    // similar to as it is done in InjectTLIMappings
+    // Add the freshly created function to llvm.compiler.used, similar to how
+    // it is done in InjectTLIMappings.
     appendToCompilerUsed(*M, {TLIFunc});
-
     LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Adding `" << TLIName
                       << "` to `@llvm.compiler.used`.\n");
     ++NumFuncUsedAdded;
   }
+  return TLIFunc;
+}
 
-  // Replace the call to the vector intrinsic with a call
-  // to the corresponding function from the vector library.
+/// Replace the call to the vector intrinsic ( \p OldFunc ) with a call to the
+/// corresponding function from the vector library ( \p TLIFunc ).
+static bool replaceWithTLIFunction(const Module *M, CallInst &CI,
+                                   const ElementCount &VecVF, Function *OldFunc,
+                                   Function *TLIFunc, FunctionType *VecFTy,
+                                   bool IsMasked) {
   IRBuilder<> IRBuilder(&CI);
   SmallVector<Value *> Args(CI.args());
+  if (IsMasked) {
+    assert(Args.size() != VecFTy->getNumParams() &&
+           "mask was already in place");
+
+    auto *MaskTy = VectorType::get(Type::getInt1Ty(M->getContext()), VecVF);
+    Args.push_back(Constant::getAllOnesValue(MaskTy));
+  }
+
   // Preserve the operand bundles.
   SmallVector<OperandBundleDef, 1> OpBundles;
   CI.getOperandBundlesAsDefs(OpBundles);
   CallInst *Replacement = IRBuilder.CreateCall(TLIFunc, Args, OpBundles);
-  assert(OldFunc->getFunctionType() == TLIFunc->getFunctionType() &&
+  assert(VecFTy == TLIFunc->getFunctionType() &&
          "Expecting function types to be identical");
   CI.replaceAllUsesWith(Replacement);
-  if (isa<FPMathOperator>(Replacement)) {
-    // Preserve fast math flags for FP math.
+  // Preserve fast math flags for FP math.
+  if (isa<FPMathOperator>(Replacement))
     Replacement->copyFastMathFlags(&CI);
-  }
 
   LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Replaced call to `"
-                    << OldFunc->getName() << "` with call to `" << TLIName
-                    << "`.\n");
+                    << OldFunc->getName() << "` with call to `"
+                    << TLIFunc->getName() << "`.\n");
   ++NumCallsReplaced;
   return true;
 }
 
+/// Utility method to get the VecDesc, depending on whether there is a TLI
+/// mapping, either with or without a mask.
+static std::optional<const VecDesc *> getVecDesc(const TargetLibraryInfo &TLI,
+                                                 const StringRef &ScalarName,
+                                                 const ElementCount &VF) {
+  const VecDesc *VDMasked = TLI.getVectorMappingInfo(ScalarName, VF, true);
+  const VecDesc *VDNoMask = TLI.getVectorMappingInfo(ScalarName, VF, false);
+  // Invalid when both variants exist (i.e. masked and unmasked), or neither.
+  if ((VDMasked == nullptr) == (VDNoMask == nullptr))
+    return std::nullopt;
+
+  return {VDMasked != nullptr ? VDMasked : VDNoMask};
+}
+
+/// Returns true when it is able to replace a call to the intrinsic \p CI with
+/// a TLI-mapped call.
 static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI,
                                     CallInst &CI) {
-  if (!CI.getCalledFunction()) {
+  if (!CI.getCalledFunction())
     return false;
-  }
 
   auto IntrinsicID = CI.getCalledFunction()->getIntrinsicID();
-  if (IntrinsicID == Intrinsic::not_intrinsic) {
-    // Replacement is only performed for intrinsic functions
+  // Replacement is only performed for intrinsic functions
+  if (IntrinsicID == Intrinsic::not_intrinsic)
     return false;
-  }
 
-  // Convert vector arguments to scalar type and check that
-  // all vector operands have identical vector width.
+  // Convert vector arguments to scalar type and check that all vector operands
+  // have identical vector width.
   ElementCount VF = ElementCount::getFixed(0);
   SmallVector<Type *> ScalarTypes;
-  bool MayBeMasked = false;
   for (auto Arg : enumerate(CI.args())) {
-    auto *ArgType = Arg.value()->getType();
-    // Vector calls to intrinsics can still have
-    // scalar operands for specific arguments.
+    auto *ArgTy = Arg.value()->getType();
     if (isVectorIntrinsicWithScalarOpAtArg(IntrinsicID, Arg.index())) {
-      ScalarTypes.push_back(ArgType);
-    } else {
-      // The argument in this place should be a vector if
-      // this is a call to a vector intrinsic.
-      auto *VectorArgTy = dyn_cast<VectorType>(ArgType);
-      if (!VectorArgTy) {
-        // The argument is not a vector, do not perform
-        // the replacement.
-        return false;
-      }
-      ElementCount NumElements = VectorArgTy->getElementCount();
-      if (NumElements.isScalable())
-        MayBeMasked = true;
-
-      // The different arguments differ in vector size.
-      if (VF.isNonZero() && VF != NumElements)
+      ScalarTypes.push_back(ArgTy);
+    } else if (auto *VectorArgTy = dyn_cast<VectorType>(ArgTy)) {
+      ScalarTypes.push_back(ArgTy->getScalarType());
+      // Disallow vector arguments with different VFs. When processing the
+      // first vector argument, store its VF, and for the rest ensure that
+      // they match it.
+      if (VF.isZero())
+        VF = VectorArgTy->getElementCount();
+      else if (VF != VectorArgTy->getElementCount())
         return false;
-      VF = NumElements;
-      ScalarTypes.push_back(VectorArgTy->getElementType());
+    } else {
+      // Reached when the argument was expected to be a vector but is not.
+      return false;
     }
   }
 
-  // Try to reconstruct the name for the scalar version of this
-  // intrinsic using the intrinsic ID and the argument types
-  // converted to scalar above.
-  std::string ScalarName;
-  if (Intrinsic::isOverloaded(IntrinsicID)) {
-    ScalarName = Intrinsic::getName(IntrinsicID, ScalarTypes, CI.getModule());
-  } else {
-    ScalarName = Intrinsic::getName(IntrinsicID).str();
-  }
+  // Try to reconstruct the name for the scalar version of this intrinsic using
+  // the intrinsic ID and the argument types converted to scalar above.
+  std::string ScalarName =
+      (Intrinsic::isOverloaded(IntrinsicID)
+           ? Intrinsic::getName(IntrinsicID, ScalarTypes, CI.getModule())
+           : Intrinsic::getName(IntrinsicID).str());
 
-  if (!TLI.isFunctionVectorizable(ScalarName)) {
-    // The TargetLibraryInfo does not contain a vectorized version of
-    // the scalar function.
+  // The TargetLibraryInfo does not contain a vectorized version of the scalar
+  // function.
+  if (!TLI.isFunctionVectorizable(ScalarName))
     return false;
-  }
 
-  // Assume it has a mask when that is a possibility and has no mapping for
-  // a Non-Masked variant.
-  const bool IsMasked =
-      MayBeMasked && !TLI.getVectorMappingInfo(ScalarName, VF, false);
-  // Try to find the mapping for the scalar version of this intrinsic
-  // and the exact vector width of the call operands in the
-  // TargetLibraryInfo.
-  StringRef TLIName = TLI.getVectorizedFunction(ScalarName, VF, IsMasked);
+  auto OptVD = getVecDesc(TLI, ScalarName, VF);
+  if (!OptVD)
+    return false;
+
+  const VecDesc *VD = *OptVD;
+  // Try to find the mapping for the scalar version of this intrinsic and the
+  // exact vector width of the call operands in the TargetLibraryInfo.
+  StringRef TLIName = TLI.getVectorizedFunction(ScalarName, VF, VD->isMasked());
   LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Looking up TLI mapping for `"
                     << ScalarName << "` and vector width " << VF << ".\n");
 
-  if (!TLIName.empty()) {
-    // Found the correct mapping in the TargetLibraryInfo,
-    // replace the call to the intrinsic with a call to
-    // the vector library function.
-    LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Found TLI function `" << TLIName
-                      << "`.\n");
-    return replaceWithTLIFunction(CI, TLIName);
-  }
+  // TLI failed to find a correct mapping.
+  if (TLIName.empty())
+    return false;
 
-  return false;
+  // Find the vector Function and replace the call to the intrinsic with a call
+  // to the vector library function.
+  LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Found TLI function `" << TLIName
+                    << "`.\n");
+
+  Type *ScalarRetTy = CI.getType()->getScalarType();
+  FunctionType *ScalarFTy = FunctionType::get(ScalarRetTy, ScalarTypes, false);
+  const std::string MangledName = VD->getVectorFunctionABIVariantString();
+  auto OptInfo = VFABI::tryDemangleForVFABI(MangledName, ScalarFTy);
+  if (!OptInfo)
+    return false;
+
+  // Get the vector FunctionType.
+  Module *M = CI.getModule();
+  auto OptFTy = VFABI::createFunctionType(*OptInfo, ScalarFTy);
+  if (!OptFTy)
+    return false;
+
+  Function *OldFunc = CI.getCalledFunction();
+  FunctionType *VectorFTy = *OptFTy;
+  Function *TLIFunc = getTLIFunction(M, VectorFTy, OldFunc, TLIName);
+  return replaceWithTLIFunction(M, CI, OptInfo->Shape.VF, OldFunc, TLIFunc,
+                                VectorFTy, VD->isMasked());
 }
 
 static bool runImpl(const TargetLibraryInfo &TLI, Function &F) {
@@ -185,9 +217,8 @@ static bool runImpl(const TargetLibraryInfo &TLI, Function &F) {
   }
   // Erase the calls to the intrinsics that have been replaced
   // with calls to the vector library.
-  for (auto *CI : ReplacedCalls) {
+  for (auto *CI : ReplacedCalls)
     CI->eraseFromParent();
-  }
   return Changed;
 }
 
@@ -207,10 +238,10 @@ PreservedAnalyses ReplaceWithVeclib::run(Function &F,
     PA.preserve<DemandedBitsAnalysis>();
     PA.preserve<OptimizationRemarkEmitterAnalysis>();
     return PA;
-  } else {
-    // The pass did not replace any calls, hence it preserves all analyses.
-    return PreservedAnalyses::all();
   }
+
+  // The pass did not replace any calls, hence it preserves all analyses.
+  return PreservedAnalyses::all();
 }
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-armpl.ll b/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-armpl.ll
index 633cb220f52464..d41870ec6e7915 100644
--- a/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-armpl.ll
+++ b/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-armpl.ll
@@ -40,7 +40,7 @@ define <4 x float> @llvm_cos_f32(<4 x float> %in) {
 define <vscale x 2 x double> @llvm_cos_vscale_f64(<vscale x 2 x double> %in) #0 {
 ; CHECK-LABEL: define <vscale x 2 x double> @llvm_cos_vscale_f64
 ; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1:[0-9]+]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @armpl_svcos_f64_x(<vscale x 2 x double> [[IN]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @armpl_svcos_f64_x(<vscale x 2 x double> [[IN]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = call fast <vscale x 2 x double> @llvm.cos.nxv2f64(<vscale x 2 x double> %in)
@@ -50,7 +50,7 @@ define <vscale x 2 x double> @llvm_cos_vscale_f64(<vscale x 2 x double> %in) #0
 define <vscale x 4 x float> @llvm_cos_vscale_f32(<vscale x 4 x float> %in) #0 {
 ; CHECK-LABEL: define <vscale x 4 x float> @llvm_cos_vscale_f32
 ; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @armpl_svcos_f32_x(<vscale x 4 x float> [[IN]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @armpl_svcos_f32_x(<vscale x 4 x float> [[IN]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 ;
   %1 = call fast <vscale x 4 x float> @llvm.cos.nxv4f32(<vscale x 4 x float> %in)
@@ -85,7 +85,7 @@ define <4 x float> @llvm_sin_f32(<4 x float> %in) {
 define <vscale x 2 x double> @llvm_sin_vscale_f64(<vscale x 2 x double> %in) #0 {
 ; CHECK-LABEL: define <vscale x 2 x double> @llvm_sin_vscale_f64
 ; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @armpl_svsin_f64_x(<vscale x 2 x double> [[IN]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @armpl_svsin_f64_x(<vscale x 2 x double> [[IN]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = call fast <vscale x 2 x double> @llvm.sin.nxv2f64(<vscale x 2 x double> %in)
@@ -95,7 +95,7 @@ define <vscale x 2 x double> @llvm_sin_vscale_f64(<vscale x 2 x double> %in) #0
 define <vscale x 4 x float> @llvm_sin_vscale_f32(<vscale x 4 x float> %in) #0 {
 ; CHECK-LABEL: define <vscale x 4 x float> @llvm_sin_vscale_f32
 ; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @armpl_svsin_f32_x(<vscale x 4 x float> [[IN]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @armpl_svsin_f32_x(<vscale x 4 x float> [[IN]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 ;
   %1 = call fast <vscale x 4 x float> @llvm.sin.nxv4f32(<vscale x 4 x float> %in)
@@ -130,7 +130,7 @@ define <4 x float> @llvm_exp_f32(<4 x float> %in) {
 define <vscale x 2 x double> @llvm_exp_vscale_f64(<vscale x 2 x double> %in) #0 {
 ; CHECK-LABEL: define <vscale x 2 x double> @llvm_exp_vscale_f64
 ; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @armpl_svexp_f64_x(<vscale x 2 x double> [[IN]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @armpl_svexp_f64_x(<vscale x 2 x double> [[IN]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = call fast <vscale x 2 x double> @llvm.exp.nxv2f64(<vscale x 2 x double> %in)
@@ -140,7 +140,7 @@ define <vscale x 2 x double> @llvm_exp_vscale_f64(<vscale x 2 x double> %in) #0
 define <vscale x 4 x float> @llvm_exp_vscale_f32(<vscale x 4 x float> %in) #0 {
 ; CHECK-LABEL: define <vscale x 4 x float> @llvm_exp_vscale_f32
 ; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @armpl_svexp_f32_x(<vscale x 4 x float> [[IN]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @armpl_svexp_f32_x(<vscale x 4 x float> [[IN]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 ;
   %1 = call fast <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float> %in)
@@ -175,7 +175,7 @@ define <4 x float> @llvm_exp2_f32(<4 x float> %in) {
 define <vscale x 2 x double> @llvm_exp2_vscale_f64(<vscale x 2 x double> %in) #0 {
 ; CHECK-LABEL: define <vscale x 2 x double> @llvm_exp2_vscale_f64
 ; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @armpl_svexp2_f64_x(<vscale x 2 x double> [[IN]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @armpl_svexp2_f64_x(<vscale x 2 x double> [[IN]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = call fast <vscale x 2 x double> @llvm.exp2.nxv2f64(<vscale x 2 x double> %in)
@@ -185,7 +185,7 @@ define <vscale x 2 x double> @llvm_exp2_vscale_f64(<vscale x 2 x double> %in) #0
 define <vscale x 4 x float> @llvm_exp2_vscale_f32(<vscale x 4 x float> %in) #0 {
 ; CHECK-LABEL: define <vscale x 4 x float> @llvm_exp2_vscale_f32
 ; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @armpl_svexp2_f32_x(<vscale x 4 x float> [[IN]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @armpl_svexp2_f32_x(<vscale x 4 x float> [[IN]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 ;
   %1 = call fast <vscale x 4 x float> @llvm.exp2.nxv4f32(<vscale x 4 x float> %in)
@@ -220,7 +220,7 @@ define <4 x float> @llvm_exp10_f32(<4 x float> %in) {
 define <vscale x 2 x double> @llvm_exp10_vscale_f64(<vscale x 2 x double> %in) #0 {
 ; CHECK-LABEL: define <vscale x 2 x double> @llvm_exp10_vscale_f64
 ; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @armpl_svexp10_f64_x(<vscale x 2 x double> [[IN]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @armpl_svexp10_f64_x(<vscale x 2 x double> [[IN]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = call fast <vscale x 2 x double> @llvm.exp10.nxv2f64(<vscale x 2 x double> %in)
@@ -230,7 +230,7 @@ define <vscale x 2 x double> @llvm_exp10_vscale_f64(<vscale x 2 x double> %in) #
 define <vscale x 4 x float> @llvm_exp10_vscale_f32(<vscale x 4 x float> %in) #0 {
 ; CHECK-LABEL: define <vscale x 4 x float> @llvm_exp10_vscale_f32
 ; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @armpl_svexp10_f32_x(<vscale x 4 x float> [[IN]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @armpl_svexp10_f32_x(<vscale x 4 x float> [[IN]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 ;
   %1 = call fast <vscale x 4 x float> @llvm.exp10.nxv4f32(<vscale x 4 x float> %in)
@@ -265,7 +265,7 @@ define <4 x float> @llvm_log_f32(<4 x float> %in) {
 define <vscale x 2 x double> @llvm_log_vscale_f64(<vscale x 2 x double> %in) #0 {
 ; CHECK-LABEL: define <vscale x 2 x double> @llvm_log_vscale_f64
 ; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @armpl_svlog_f64_x(<vscale x 2 x double> [[IN]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @armpl_svlog_f64_x(<vscale x 2 x double> [[IN]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = call fast <vscale x 2 x double> @llvm.log.nxv2f64(<vscale x 2 x double> %in)
@@ -275,7 +275,7 @@ define <vscale x 2 x double> @llvm_log_vscale_f64(<vscale x 2 x double> %in) #0
 define <vscale x 4 x float> @llvm_log_vscale_f32(<vscale x 4 x float> %in) #0 {
 ; CHECK-LABEL: define <vscale x 4 x float> @llvm_log_vscale_f32
 ; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @armpl_svlog_f32_x(<vscale x 4 x float> [[IN]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @armpl_svlog_f32_x(<vscale x 4 x float> [[IN]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 ;
   %1 = call fast <vscale x 4 x float> @llvm.log.nxv4f32(<vscale x 4 x float> %in)
@@ -310,7 +310,7 @@ define <4 x float> @llvm_log2_f32(<4 x float> %in) {
 define <vscale x 2 x double> @llvm_log2_vscale_f64(<vscale x 2 x double> %in) #0 {
 ; CHECK-LABEL: define <vscale x 2 x double> @llvm_log2_vscale_f64
 ; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @armpl_svlog2_f64_x(<vscale x 2 x double> [[IN]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @armpl_svlog2_f64_x(<vscale x 2 x double> [[IN]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = call fast <vscale x 2 x double> @llvm.log2.nxv2f64(<vscale x 2 x double> %in)
@@ -320,7 +320,7 @@ define <vscale x 2 x double> @llvm_log2_vscale_f64(<vscale x 2 x double> %in) #0
 define <vscale x 4 x float> @llvm_log2_vscale_f32(<vscale x 4 x float> %in) #0 {
 ; CHECK-LABEL: define <vscale x 4 x float> @llvm_log2_vscale_f32
 ; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @armpl_svlog2_f32_x(<vscale x 4 x float> [[IN]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @armpl_svlog2_f32_x(<vscale x 4 x float> [[IN]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 ;
   %1 = call fast <vscale x 4 x float> @llvm.log2.nxv4f32(<vscale x 4 x float> %in)
@@ -355,7 +355,7 @@ define <4 x float> @llvm_log10_f32(<4 x float> %in) {
 define <vscale x 2 x double> @llvm_log10_vscale_f64(<vscale x 2 x double> %in) #0 {
 ; CHECK-LABEL: define <vscale x 2 x double> @llvm_log10_vscale_f64
 ; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @armpl_svlog10_f64_x(<vscale x 2 x double> [[IN]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @armpl_svlog10_f64_x(<vscale x 2 x double> [[IN]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = call fast <vscale x 2 x double> @llvm.log10.nxv2f64(<vscale x 2 x double> %in)
@@ -365,7 +365,7 @@ define <vscale x 2 x double> @llvm_log10_vscale_f64(<vscale x 2 x double> %in) #
 define <vscale x 4 x float> @llvm_log10_vscale_f32(<vscale x 4 x float> %in) #0 {
 ; CHECK-LABEL: define <vscale x 4 x float> @llvm_log10_vscale_f32
 ; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @armpl_svlog10_f32_x(<vscale x 4 x float> [[IN]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @armpl_svlog10_f32_x(<vscale x 4 x float> [[IN]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 ;
   %1 = call fast <vscale x 4 x float> @llvm.log10.nxv4f32(<vscale x 4 x float> %in)
diff --git a/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-sleef-scalable.ll b/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-sleef-scalable.ll
index 969945590a0a18..baf16f83a3e240 100644
--- a/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-sleef-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-sleef-scalable.ll
@@ -46,7 +46,7 @@ define <vscale x 4 x float> @llvm_copysign_vscale_f32(<vscale x 4 x float> %mag,
 
 define <vscale x 2 x double> @llvm_cos_vscale_f64(<vscale x 2 x double> %in) {
 ; CHECK-LABEL: @llvm_cos_vscale_f64(
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_cos(<vscale x 2 x double> [[IN:%.*]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_cos(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = call fast <vscale x 2 x double> @llvm.cos.nxv2f64(<vscale x 2 x double> %in)
@@ -55,7 +55,7 @@ define <vscale x 2 x double> @llvm_cos_vscale_f64(<vscale x 2 x double> %in) {
 
 define <vscale x 4 x float> @llvm_cos_vscale_f32(<vscale x 4 x float> %in) {
 ; CHECK-LABEL: @llvm_cos_vscale_f32(
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_cosf(<vscale x 4 x float> [[IN:%.*]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_cosf(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 ;
   %1 = call fast <vscale x 4 x float> @llvm.cos.nxv4f32(<vscale x 4 x float> %in)
@@ -64,7 +64,7 @@ define <vscale x 4 x float> @llvm_cos_vscale_f32(<vscale x 4 x float> %in) {
 
 define <vscale x 2 x double> @llvm_exp_vscale_f64(<vscale x 2 x double> %in) {
 ; CHECK-LABEL: @llvm_exp_vscale_f64(
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_exp(<vscale x 2 x double> [[IN:%.*]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_exp(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = call fast <vscale x 2 x double> @llvm.exp.nxv2f64(<vscale x 2 x double> %in)
@@ -73,7 +73,7 @@ define <vscale x 2 x double> @llvm_exp_vscale_f64(<vscale x 2 x double> %in) {
 
 define <vscale x 4 x float> @llvm_exp_vscale_f32(<vscale x 4 x float> %in) {
 ; CHECK-LABEL: @llvm_exp_vscale_f32(
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_expf(<vscale x 4 x float> [[IN:%.*]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_expf(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 ;
   %1 = call fast <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float> %in)
@@ -82,7 +82,7 @@ define <vscale x 4 x float> @llvm_exp_vscale_f32(<vscale x 4 x float> %in) {
 
 define <vscale x 2 x double> @llvm_exp2_vscale_f64(<vscale x 2 x double> %in) {
 ; CHECK-LABEL: @llvm_exp2_vscale_f64(
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_exp2(<vscale x 2 x double> [[IN:%.*]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_exp2(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = call fast <vscale x 2 x double> @llvm.exp2.nxv2f64(<vscale x 2 x double> %in)
@@ -91,7 +91,7 @@ define <vscale x 2 x double> @llvm_exp2_vscale_f64(<vscale x 2 x double> %in) {
 
 define <vscale x 4 x float> @llvm_exp2_vscale_f32(<vscale x 4 x float> %in) {
 ; CHECK-LABEL: @llvm_exp2_vscale_f32(
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_exp2f(<vscale x 4 x float> [[IN:%.*]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_exp2f(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 ;
   %1 = call fast <vscale x 4 x float> @llvm.exp2.nxv4f32(<vscale x 4 x float> %in)
@@ -100,7 +100,7 @@ define <vscale x 4 x float> @llvm_exp2_vscale_f32(<vscale x 4 x float> %in) {
 
 define <vscale x 2 x double> @llvm_exp10_vscale_f64(<vscale x 2 x double> %in) {
 ; CHECK-LABEL: @llvm_exp10_vscale_f64(
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_exp10(<vscale x 2 x double> [[IN:%.*]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_exp10(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = call fast <vscale x 2 x double> @llvm.exp10.nxv2f64(<vscale x 2 x double> %in)
@@ -109,7 +109,7 @@ define <vscale x 2 x double> @llvm_exp10_vscale_f64(<vscale x 2 x double> %in) {
 
 define <vscale x 4 x float> @llvm_exp10_vscale_f32(<vscale x 4 x float> %in) {
 ; CHECK-LABEL: @llvm_exp10_vscale_f32(
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_exp10f(<vscale x 4 x float> [[IN:%.*]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_exp10f(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 ;
   %1 = call fast <vscale x 4 x float> @llvm.exp10.nxv4f32(<vscale x 4 x float> %in)
@@ -172,7 +172,7 @@ define <vscale x 4 x float> @llvm_fma_vscale_f32(<vscale x 4 x float> %a, <vscal
 
 define <vscale x 2 x double> @llvm_log_vscale_f64(<vscale x 2 x double> %in) {
 ; CHECK-LABEL: @llvm_log_vscale_f64(
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_log(<vscale x 2 x double> [[IN:%.*]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_log(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = call fast <vscale x 2 x double> @llvm.log.nxv2f64(<vscale x 2 x double> %in)
@@ -181,7 +181,7 @@ define <vscale x 2 x double> @llvm_log_vscale_f64(<vscale x 2 x double> %in) {
 
 define <vscale x 4 x float> @llvm_log_vscale_f32(<vscale x 4 x float> %in) {
 ; CHECK-LABEL: @llvm_log_vscale_f32(
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_logf(<vscale x 4 x float> [[IN:%.*]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_logf(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 ;
   %1 = call fast <vscale x 4 x float> @llvm.log.nxv4f32(<vscale x 4 x float> %in)
@@ -190,7 +190,7 @@ define <vscale x 4 x float> @llvm_log_vscale_f32(<vscale x 4 x float> %in) {
 
 define <vscale x 2 x double> @llvm_log10_vscale_f64(<vscale x 2 x double> %in) {
 ; CHECK-LABEL: @llvm_log10_vscale_f64(
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_log10(<vscale x 2 x double> [[IN:%.*]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_log10(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = call fast <vscale x 2 x double> @llvm.log10.nxv2f64(<vscale x 2 x double> %in)
@@ -199,7 +199,7 @@ define <vscale x 2 x double> @llvm_log10_vscale_f64(<vscale x 2 x double> %in) {
 
 define <vscale x 4 x float> @llvm_log10_vscale_f32(<vscale x 4 x float> %in) {
 ; CHECK-LABEL: @llvm_log10_vscale_f32(
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_log10f(<vscale x 4 x float> [[IN:%.*]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_log10f(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 ;
   %1 = call fast <vscale x 4 x float> @llvm.log10.nxv4f32(<vscale x 4 x float> %in)
@@ -208,7 +208,7 @@ define <vscale x 4 x float> @llvm_log10_vscale_f32(<vscale x 4 x float> %in) {
 
 define <vscale x 2 x double> @llvm_log2_vscale_f64(<vscale x 2 x double> %in) {
 ; CHECK-LABEL: @llvm_log2_vscale_f64(
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_log2(<vscale x 2 x double> [[IN:%.*]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_log2(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = call fast <vscale x 2 x double> @llvm.log2.nxv2f64(<vscale x 2 x double> %in)
@@ -217,7 +217,7 @@ define <vscale x 2 x double> @llvm_log2_vscale_f64(<vscale x 2 x double> %in) {
 
 define <vscale x 4 x float> @llvm_log2_vscale_f32(<vscale x 4 x float> %in) {
 ; CHECK-LABEL: @llvm_log2_vscale_f32(
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_log2f(<vscale x 4 x float> [[IN:%.*]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_log2f(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 ;
   %1 = call fast <vscale x 4 x float> @llvm.log2.nxv4f32(<vscale x 4 x float> %in)
@@ -334,7 +334,7 @@ define <vscale x 4 x float> @llvm_round_vscale_f32(<vscale x 4 x float> %in) {
 
 define <vscale x 2 x double> @llvm_sin_vscale_f64(<vscale x 2 x double> %in) {
 ; CHECK-LABEL: @llvm_sin_vscale_f64(
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_sin(<vscale x 2 x double> [[IN:%.*]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_sin(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = call fast <vscale x 2 x double> @llvm.sin.nxv2f64(<vscale x 2 x double> %in)
@@ -343,7 +343,7 @@ define <vscale x 2 x double> @llvm_sin_vscale_f64(<vscale x 2 x double> %in) {
 
 define <vscale x 4 x float> @llvm_sin_vscale_f32(<vscale x 4 x float> %in) {
 ; CHECK-LABEL: @llvm_sin_vscale_f32(
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_sinf(<vscale x 4 x float> [[IN:%.*]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_sinf(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 ;
   %1 = call fast <vscale x 4 x float> @llvm.sin.nxv4f32(<vscale x 4 x float> %in)
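
The long <vscale x N x i1> operand in the updated CHECK lines is simply how a
splat of 'i1 true' prints for scalable vectors. A sketch of how such an
all-active predicate is built (Ctx stands for an LLVMContext assumed to be in
scope):

  // Prints as: shufflevector (<vscale x 2 x i1> insertelement
  //   (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison,
  //   <vscale x 2 x i32> zeroinitializer)
  auto *MaskTy = VectorType::get(Type::getInt1Ty(Ctx),
                                 ElementCount::getScalable(2));
  Constant *AllActive = Constant::getAllOnesValue(MaskTy);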

>From bfcf1869e6595a8be3be7aa9aab371e56b8b86b4 Mon Sep 17 00:00:00 2001
From: Paschalis Mpeis <Paschalis.Mpeis at arm.com>
Date: Wed, 13 Dec 2023 18:08:46 +0000
Subject: [PATCH 10/10] getVecDesc now prioritizes masked variant

Also, further cleanup to address reviewers' comments.
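
A minimal sketch of the mask handling after this change (condensed from the
patch below): the all-active mask is now inserted at the parameter index
reported by the VFABI demangler rather than unconditionally appended.

  // Info is the demangled VFInfo; Args holds the original call operands.
  if (auto MaskPos = Info.getParamIndexForOptionalMask()) {
    auto *MaskTy =
        VectorType::get(Type::getInt1Ty(CI.getContext()), Info.Shape.VF);
    Args.insert(Args.begin() + *MaskPos, Constant::getAllOnesValue(MaskTy));
  }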
---
 llvm/lib/CodeGen/ReplaceWithVeclib.cpp        | 85 ++++++++-----------
 ...e-intrinsics-with-veclib-sleef-scalable.ll |  2 -
 2 files changed, 36 insertions(+), 51 deletions(-)

diff --git a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp
index ddcc55a8e52c40..4ea163e4eaafbb 100644
--- a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp
+++ b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp
@@ -69,20 +69,20 @@ Function *getTLIFunction(Module *M, FunctionType *VectorFTy,
   return TLIFunc;
 }
 
-/// Replace the call to the vector intrinsic ( \p OldFunc ) with a call to the
-/// corresponding function from the vector library ( \p TLIFunc ).
-static bool replaceWithTLIFunction(const Module *M, CallInst &CI,
-                                   const ElementCount &VecVF, Function *OldFunc,
-                                   Function *TLIFunc, FunctionType *VecFTy,
-                                   bool IsMasked) {
+/// Replace the call \p CI to the vector intrinsic with a call to the
+/// corresponding function from the vector library (\p TLIFunc).
+static void replaceWithTLIFunction(CallInst &CI, VFInfo &Info,
+                                   Function *TLIFunc, FunctionType *VecFTy) {
   IRBuilder<> IRBuilder(&CI);
   SmallVector<Value *> Args(CI.args());
-  if (IsMasked) {
+  if (auto OptMaskpos = Info.getParamIndexForOptionalMask()) {
     assert(Args.size() != VecFTy->getNumParams() &&
            "mask was already in place");
 
-    auto *MaskTy = VectorType::get(Type::getInt1Ty(M->getContext()), VecVF);
-    Args.push_back(Constant::getAllOnesValue(MaskTy));
+    auto *MaskTy =
+        VectorType::get(Type::getInt1Ty(CI.getContext()), Info.Shape.VF);
+    Args.insert(Args.begin() + OptMaskpos.value(),
+                Constant::getAllOnesValue(MaskTy));
   }
 
   // Preserve the operand bundles.
@@ -95,26 +95,18 @@ static bool replaceWithTLIFunction(const Module *M, CallInst &CI,
   // Preserve fast math flags for FP math.
   if (isa<FPMathOperator>(Replacement))
     Replacement->copyFastMathFlags(&CI);
-
-  LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Replaced call to `"
-                    << OldFunc->getName() << "` with call to `"
-                    << TLIFunc->getName() << "`.\n");
-  ++NumCallsReplaced;
-  return true;
 }
 
-/// Utility method to get the VecDesc, depending on whether there is a TLI
-/// mapping, either with or without a mask.
+/// Utility method to get the VecDesc for \p ScalarName, preferring a masked
+/// TLI mapping over an unmasked one when both exist.
 static std::optional<const VecDesc *> getVecDesc(const TargetLibraryInfo &TLI,
                                                  const StringRef &ScalarName,
                                                  const ElementCount &VF) {
-  const VecDesc *VDMasked = TLI.getVectorMappingInfo(ScalarName, VF, true);
-  const VecDesc *VDNoMask = TLI.getVectorMappingInfo(ScalarName, VF, false);
-  // Invalid when both variants exist (i.e. masked and unmasked), or neither.
-  if ((VDMasked == nullptr) == (VDNoMask == nullptr))
-    return std::nullopt;
-
-  return {VDMasked != nullptr ? VDMasked : VDNoMask};
+  if (auto *VDMasked = TLI.getVectorMappingInfo(ScalarName, VF, true))
+    return VDMasked;
+  if (auto *VDNoMask = TLI.getVectorMappingInfo(ScalarName, VF, false))
+    return VDNoMask;
+  return std::nullopt;
 }
 
 /// Returns whether it is able to replace a call to the intrinsic \p CI with a
@@ -146,10 +138,9 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI,
         VF = VectorArgTy->getElementCount();
       else if (VF != VectorArgTy->getElementCount())
         return false;
-    } else {
+    } else
       // Reached when the argument was expected to be a vector but is not.
       return false;
-    }
   }
 
   // Try to reconstruct the name for the scalar version of this intrinsic using
@@ -164,26 +155,19 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI,
   if (!TLI.isFunctionVectorizable(ScalarName))
     return false;
 
+  // Try to find the mapping for the scalar version of this intrinsic and the
+  // exact vector width of the call operands in the TargetLibraryInfo.
   auto OptVD = getVecDesc(TLI, ScalarName, VF);
   if (!OptVD)
     return false;
 
   const VecDesc *VD = *OptVD;
-  // Try to find the mapping for the scalar version of this intrinsic and the
-  // exact vector width of the call operands in the TargetLibraryInfo.
-  StringRef TLIName = TLI.getVectorizedFunction(ScalarName, VF, VD->isMasked());
-  LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Looking up TLI mapping for `"
-                    << ScalarName << "` and vector width " << VF << ".\n");
-
-  // TLI failed to find a correct mapping.
-  if (TLIName.empty())
-    return false;
-
-  // Find the vector Function and replace the call to the intrinsic with a call
-  // to the vector library function.
-  LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Found TLI function `" << TLIName
-                    << "`.\n");
+  LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Found TLI mapping from `" << ScalarName
+                    << "` at vector width " << VF << " to `"
+                    << VD->getVectorFnName() << "`.\n");
 
+  // Replace the call to the intrinsic with a call to the vector library
+  // function.
   Type *ScalarRetTy = CI.getType()->getScalarType();
   FunctionType *ScalarFTy = FunctionType::get(ScalarRetTy, ScalarTypes, false);
   const std::string MangledName = VD->getVectorFunctionABIVariantString();
@@ -191,17 +175,20 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI,
   if (!OptInfo)
     return false;
 
-  // get the vector FunctionType
-  Module *M = CI.getModule();
-  auto OptFTy = VFABI::createFunctionType(*OptInfo, ScalarFTy);
-  if (!OptFTy)
+  FunctionType *VectorFTy = VFABI::createFunctionType(*OptInfo, ScalarFTy);
+  if (!VectorFTy)
     return false;
 
-  Function *OldFunc = CI.getCalledFunction();
-  FunctionType *VectorFTy = *OptFTy;
-  Function *TLIFunc = getTLIFunction(M, VectorFTy, OldFunc, TLIName);
-  return replaceWithTLIFunction(M, CI, OptInfo->Shape.VF, OldFunc, TLIFunc,
-                                VectorFTy, VD->isMasked());
+  Function *FuncToReplace = CI.getCalledFunction();
+  Function *TLIFunc = getTLIFunction(CI.getModule(), VectorFTy, FuncToReplace,
+                                     VD->getVectorFnName());
+  replaceWithTLIFunction(CI, *OptInfo, TLIFunc, VectorFTy);
+
+  LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Replaced call to `"
+                    << FuncToReplace->getName() << "` with call to `"
+                    << TLIFunc->getName() << "`.\n");
+  ++NumCallsReplaced;
+  return true;
 }
 
 static bool runImpl(const TargetLibraryInfo &TLI, Function &F) {
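The SLEEF names exercised by the test below follow the VFABI mangling scheme
_ZGV<isa><mask><vlen><parameters>_<scalarname>; for example, _ZGVsMxv_cos
decodes as:

  s   : SVE ISA
  M   : masked variant
  x   : scalable (vscale-dependent) vector length
  v   : a single vector parameter
  cos : the scalar function name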
diff --git a/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-sleef-scalable.ll b/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-sleef-scalable.ll
index baf16f83a3e240..c2ff6014bc6944 100644
--- a/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-sleef-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-sleef-scalable.ll
@@ -3,8 +3,6 @@
 
 target triple = "aarch64-unknown-linux-gnu"
 
-; NOTE: The existing TLI mappings are not used since the -replace-with-veclib pass is broken for scalable vectors.
-
 ;.
 ; CHECK: @llvm.compiler.used = appending global [16 x ptr] [ptr @_ZGVsMxv_cos, ptr @_ZGVsMxv_cosf, ptr @_ZGVsMxv_exp, ptr @_ZGVsMxv_expf, ptr @_ZGVsMxv_exp2, ptr @_ZGVsMxv_exp2f, ptr @_ZGVsMxv_exp10, ptr @_ZGVsMxv_exp10f, ptr @_ZGVsMxv_log, ptr @_ZGVsMxv_logf, ptr @_ZGVsMxv_log10, ptr @_ZGVsMxv_log10f, ptr @_ZGVsMxv_log2, ptr @_ZGVsMxv_log2f, ptr @_ZGVsMxv_sin, ptr @_ZGVsMxv_sinf], section "llvm.metadata"
 ;.
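Taken together, the pass now performs rewrites of the following shape on
scalable IR (abridged sketch in the spirit of the test above; <all-true> is
shorthand for the splat-of-true mask constant shown earlier):

  ; before
  %r = call <vscale x 2 x double> @llvm.cos.nxv2f64(<vscale x 2 x double> %in)
  ; after -replace-with-veclib (SLEEF mappings enabled)
  %r = call <vscale x 2 x double> @_ZGVsMxv_cos(<vscale x 2 x double> %in,
                                                <vscale x 2 x i1> <all-true>)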


