[llvm] [TLI] Pass replace-with-veclib works with Scalable Vectors. (PR #73642)

Tue Dec 19 08:36:31 PST 2023

================
@@ -38,138 +42,152 @@ STATISTIC(NumTLIFuncDeclAdded,
 STATISTIC(NumFuncUsedAdded,
           "Number of functions added to `llvm.compiler.used`");
 
-static bool replaceWithTLIFunction(CallInst &CI, const StringRef TLIName) {
-  Module *M = CI.getModule();
-
-  Function *OldFunc = CI.getCalledFunction();
-
-  // Check if the vector library function is already declared in this module,
-  // otherwise insert it.
+/// Returns a vector Function that it adds to the Module \p M. When an \p
+/// ScalarFunc is not null, it copies its attributes to the newly created
+/// Function.
+Function *getTLIFunction(Module *M, FunctionType *VectorFTy,
+                         Function *ScalarFunc, const StringRef TLIName) {
   Function *TLIFunc = M->getFunction(TLIName);
   if (!TLIFunc) {
-    TLIFunc = Function::Create(OldFunc->getFunctionType(),
-                               Function::ExternalLinkage, TLIName, *M);
-    TLIFunc->copyAttributesFrom(OldFunc);
+    TLIFunc =
+        Function::Create(VectorFTy, Function::ExternalLinkage, TLIName, *M);
+    if (ScalarFunc)
+      TLIFunc->copyAttributesFrom(ScalarFunc);
 
     LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Added vector library function `"
                       << TLIName << "` of type `" << *(TLIFunc->getType())
                       << "` to module.\n");
 
     ++NumTLIFuncDeclAdded;
-
-    // Add the freshly created function to llvm.compiler.used,
-    // similar to as it is done in InjectTLIMappings
+    // Add the freshly created function to llvm.compiler.used, similar to as it
+    // is done in InjectTLIMappings.
     appendToCompilerUsed(*M, {TLIFunc});
-
     LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Adding `" << TLIName
                       << "` to `@llvm.compiler.used`.\n");
     ++NumFuncUsedAdded;
   }
+  return TLIFunc;
+}
 
-  // Replace the call to the vector intrinsic with a call
-  // to the corresponding function from the vector library.
+/// Replace the call to the vector intrinsic ( \p FuncToReplace ) with a call to
+/// the corresponding function from the vector library ( \p TLIFunc ).
+static void replaceWithTLIFunction(CallInst &CI, VFInfo &Info,
+                                   Function *TLIVecFunc) {
   IRBuilder<> IRBuilder(&CI);
   SmallVector<Value *> Args(CI.args());
+  if (auto OptMaskpos = Info.getParamIndexForOptionalMask()) {
+    if (Args.size() == TLIVecFunc->getFunctionType()->getNumParams())
+      static_assert(true && "mask was already in place");
+
+    auto *MaskTy =
+        VectorType::get(Type::getInt1Ty(CI.getContext()), Info.Shape.VF);
+    Args.insert(Args.begin() + OptMaskpos.value(),
+                Constant::getAllOnesValue(MaskTy));
+  }
+
   // Preserve the operand bundles.
   SmallVector<OperandBundleDef, 1> OpBundles;
   CI.getOperandBundlesAsDefs(OpBundles);
-  CallInst *Replacement = IRBuilder.CreateCall(TLIFunc, Args, OpBundles);
-  assert(OldFunc->getFunctionType() == TLIFunc->getFunctionType() &&
-         "Expecting function types to be identical");
+  CallInst *Replacement = IRBuilder.CreateCall(TLIVecFunc, Args, OpBundles);
   CI.replaceAllUsesWith(Replacement);
-  if (isa<FPMathOperator>(Replacement)) {
-    // Preserve fast math flags for FP math.
+  // Preserve fast math flags for FP math.
+  if (isa<FPMathOperator>(Replacement))
     Replacement->copyFastMathFlags(&CI);
-  }
+}
 
-  LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Replaced call to `"
-                    << OldFunc->getName() << "` with call to `" << TLIName
-                    << "`.\n");
-  ++NumCallsReplaced;
-  return true;
+/// Utility method to get the VecDesc, depending on whether there is such a TLI
+/// mapping, prioritizing a masked version.
+static std::optional<const VecDesc *> getVecDesc(const TargetLibraryInfo &TLI,
+                                                 const StringRef &ScalarName,
+                                                 const ElementCount &VF) {
+  if (auto *VDNoMask = TLI.getVectorMappingInfo(ScalarName, VF, false))
+    return VDNoMask;
+  if (auto *VDMasked = TLI.getVectorMappingInfo(ScalarName, VF, true))
+    return VDMasked;
+  return std::nullopt;
 }
 
+/// Returns whether it is able to replace a call to the intrinsic \p CI with a
+/// TLI mapped call.
 static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI,
                                     CallInst &CI) {
-  if (!CI.getCalledFunction()) {
+  if (!CI.getCalledFunction())
     return false;
-  }
 
   auto IntrinsicID = CI.getCalledFunction()->getIntrinsicID();
-  if (IntrinsicID == Intrinsic::not_intrinsic) {
-    // Replacement is only performed for intrinsic functions
+  // Replacement is only performed for intrinsic functions.
+  if (IntrinsicID == Intrinsic::not_intrinsic)
     return false;
-  }
 
-  // Convert vector arguments to scalar type and check that
-  // all vector operands have identical vector width.
+  // Convert vector arguments to scalar type and check that all vector operands
+  // have identical vector width.
   ElementCount VF = ElementCount::getFixed(0);
-  SmallVector<Type *> ScalarTypes;
+  SmallVector<Type *> ScalarArgTypes;
   for (auto Arg : enumerate(CI.args())) {
-    auto *ArgType = Arg.value()->getType();
-    // Vector calls to intrinsics can still have
-    // scalar operands for specific arguments.
+    auto *ArgTy = Arg.value()->getType();
     if (isVectorIntrinsicWithScalarOpAtArg(IntrinsicID, Arg.index())) {
-      ScalarTypes.push_back(ArgType);
-    } else {
-      // The argument in this place should be a vector if
-      // this is a call to a vector intrinsic.
-      auto *VectorArgTy = dyn_cast<VectorType>(ArgType);
-      if (!VectorArgTy) {
-        // The argument is not a vector, do not perform
-        // the replacement.
-        return false;
-      }
-      ElementCount NumElements = VectorArgTy->getElementCount();
-      if (NumElements.isScalable()) {
-        // The current implementation does not support
-        // scalable vectors.
-        return false;
-      }
-      if (VF.isNonZero() && VF != NumElements) {
-        // The different arguments differ in vector size.
+      ScalarArgTypes.push_back(ArgTy);
+    } else if (auto *VectorArgTy = dyn_cast<VectorType>(ArgTy)) {
+      ScalarArgTypes.push_back(ArgTy->getScalarType());
+      // Disallow vector arguments with different VFs. When processing the first
+      // vector argument, store it's VF, and for the rest ensure that they match
+      // it.
+      if (VF.isZero())
+        VF = VectorArgTy->getElementCount();
+      else if (VF != VectorArgTy->getElementCount())
         return false;
-      } else {
-        VF = NumElements;
-      }
-      ScalarTypes.push_back(VectorArgTy->getElementType());
-    }
+    } else
+      // Exit when it is supposed to be a vector argument but it isn't.
+      return false;
   }
 
-  // Try to reconstruct the name for the scalar version of this
-  // intrinsic using the intrinsic ID and the argument types
-  // converted to scalar above.
-  std::string ScalarName;
-  if (Intrinsic::isOverloaded(IntrinsicID)) {
-    ScalarName = Intrinsic::getName(IntrinsicID, ScalarTypes, CI.getModule());
-  } else {
-    ScalarName = Intrinsic::getName(IntrinsicID).str();
-  }
+  // Try to reconstruct the name for the scalar version of this intrinsic using
+  // the intrinsic ID and the argument types converted to scalar above.
+  std::string ScalarName =
+      (Intrinsic::isOverloaded(IntrinsicID)
+           ? Intrinsic::getName(IntrinsicID, ScalarArgTypes, CI.getModule())
+           : Intrinsic::getName(IntrinsicID).str());
 
-  if (!TLI.isFunctionVectorizable(ScalarName)) {
-    // The TargetLibraryInfo does not contain a vectorized version of
-    // the scalar function.
+  // The TargetLibraryInfo does not contain a vectorized version of the scalar
+  // function.
+  if (!TLI.isFunctionVectorizable(ScalarName))
     return false;
-  }
 
-  // Try to find the mapping for the scalar version of this intrinsic
-  // and the exact vector width of the call operands in the
-  // TargetLibraryInfo.
-  StringRef TLIName = TLI.getVectorizedFunction(ScalarName, VF);
-
-  LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Looking up TLI mapping for `"
-                    << ScalarName << "` and vector width " << VF << ".\n");
-
-  if (!TLIName.empty()) {
-    // Found the correct mapping in the TargetLibraryInfo,
-    // replace the call to the intrinsic with a call to
-    // the vector library function.
-    LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Found TLI function `" << TLIName
-                      << "`.\n");
-    return replaceWithTLIFunction(CI, TLIName);
-  }
+  // Try to find the mapping for the scalar version of this intrinsic and the
+  // exact vector width of the call operands in the TargetLibraryInfo.
+  auto OptVD = getVecDesc(TLI, ScalarName, VF);
----------------
mgabka wrote:

IMO since this is used only once not using a function call would actually be more transparent here, and you wouldn't need to use the optional.

https://github.com/llvm/llvm-project/pull/73642