[clang] [llvm] [Vectorize][OpenMP] Support header-provided declare simd math variants (PR #189523)

Mon Mar 30 20:09:06 PDT 2026

llvmbot wrote:




@llvm/pr-subscribers-clang-codegen

Author: 王品呈 (wpcwzy)

<details>
<summary>Changes</summary>

This PR addresses an issue #120868 where the LLVM loop vectorizer fails to utilize SIMD counterparts of statndard math functions (such as acos, sin etc.) even when they are properly annotated with "#pragma omp declare simd" in system headers like glibc.

The failures were mostly caused by the following two components. First, clang emmitted LLVM intrinsics for some math functions, ignoring the "omp declare simd" metadata. Second, LoopAccessAnalysis and the cost model strictly rejected vectorization for any call marked "no-builtin", even if the call possessed an explicit vector variant via "vector-function-abi-variant" and can legally execute sequentially.

Changes contained in this PR are listed below:
- For clang codegen: Inhibits lowering to LLVM math intrinsics (e.g. llvm.acos.*) when an "OMPDeclareSimdDeclAttr" is present. Emits the mapping attribute using "VFABI::MappingAttrName" to the function declaration itself when appropriate.
- For LAA, LV & VFABIDemangler: Permits vectorizing calls that possess no-builtin so long as "VFDatabase::getMappings(*Call)" is not empty. Modifies VFABIDemangler to query the “Function”  for “vector-function-abi-variant” attributes when the “CallInst” misses them.
- For transform warnings: Stops emitting FailedRequestedVectorization remarks for OpenMP annotated parallel loops, avoiding false positives since unvectorized execution is a legal fallback. (glilbc uses -Werror option so this false postive could cause a failure in compiling.)

Regression tests for the suppresed warnings and the no-builtin variant mapping extraction are added.

Before submitting this PR, the LLVM unit tests, regression tests, llvm-test-suite and glibc tests are ran without failing, and compiling glibc with this patch enabled successfully emitted call instructions to glibc's libmvec functions, solving issue #120868 .

---
Full diff: https://github.com/llvm/llvm-project/pull/189523.diff


8 Files Affected:

- (modified) clang/lib/CodeGen/CGBuiltin.cpp (+17) 
- (modified) clang/lib/CodeGen/CGOpenMPRuntime.cpp (+75) 
- (modified) llvm/lib/Analysis/LoopAccessAnalysis.cpp (+4-3) 
- (modified) llvm/lib/IR/VFABIDemangler.cpp (+5-1) 
- (modified) llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp (+27-21) 
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+1-1) 
- (added) llvm/test/Transforms/LoopTransformWarning/omp-simd-vectorization-remarks-suppressed.ll (+30) 
- (added) llvm/test/Transforms/LoopVectorize/no-builtin-vfabi.ll (+51) 


``````````diff

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index da72a43643a54..23c89bb49a303 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -69,6 +69,19 @@ static bool shouldEmitBuiltinAsIR(unsigned BuiltinID,
   return false;
 }
 
+static bool shouldPreserveLibCallForDeclareSimd(const FunctionDecl *FD,
+                                                const LangOptions &LangOpts) {
+  if (!FD || !LangOpts.OpenMP)
+    return false;
+
+  for (const FunctionDecl *Redecl : FD->redecls()) {
+    if (Redecl->hasAttr<OMPDeclareSimdDeclAttr>())
+      return true;
+  }
+
+  return false;
+}
+
 static Value *EmitTargetArchBuiltinExpr(CodeGenFunction *CGF,
                                         unsigned BuiltinID, const CallExpr *E,
                                         ReturnValueSlot ReturnValue,
@@ -2647,6 +2660,10 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
           BuiltinID, CGM.getTriple(), ErrnoOverriden, getLangOpts().MathErrno,
           OptNone, IsOptimizationEnabled);
 
+  if (GenerateFPMathIntrinsics &&
+      shouldPreserveLibCallForDeclareSimd(FD, getLangOpts()))
+    GenerateFPMathIntrinsics = false;
+
   if (GenerateFPMathIntrinsics) {
     switch (BuiltinIDIfNoAsmLabel) {
     case Builtin::BIacos:
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index a8255ac74cfcf..338340602d30f 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -36,6 +36,7 @@
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Value.h"
+#include "llvm/IR/VFABIDemangler.h"
 #include "llvm/Support/AtomicOrdering.h"
 #include "llvm/Support/raw_ostream.h"
 #include <cassert>
@@ -11311,6 +11312,75 @@ emitX86DeclareSimdFunction(const FunctionDecl *FD, llvm::Function *Fn,
   }
 }
 
+static void emitDeclareSimdVariantMetadata(CodeGenModule &CGM,
+                                           llvm::Function *Fn) {
+  llvm::SmallVector<std::string, 8> VariantMappings;
+  for (llvm::Attribute Attr : Fn->getAttributes().getFnAttrs()) {
+    if (!Attr.isStringAttribute())
+      continue;
+
+    StringRef VariantName = Attr.getKindAsString();
+    if (!VariantName.starts_with("_ZGV"))
+      continue;
+
+    std::optional<llvm::VFInfo> Info =
+        llvm::VFABI::tryDemangleForVFABI(VariantName, Fn->getFunctionType());
+    if (!Info)
+      continue;
+
+    llvm::FunctionType *VectorTy =
+        llvm::VFABI::createFunctionType(*Info, Fn->getFunctionType());
+    llvm::Function *VecFn = llvm::cast<llvm::Function>(
+        Fn->getParent()->getOrInsertFunction(Info->VectorName, VectorTy)
+            .getCallee());
+    CGM.addCompilerUsedGlobal(VecFn);
+    VariantMappings.push_back(std::string(VariantName));
+  }
+
+  if (VariantMappings.empty())
+    return;
+
+  SmallString<256> Buffer;
+  llvm::raw_svector_ostream Out(Buffer);
+  for (const std::string &VariantName : VariantMappings)
+    Out << VariantName << ',';
+  Buffer.pop_back();
+
+  Fn->removeFnAttr(llvm::VFABI::MappingsAttrName);
+  Fn->addFnAttr(llvm::VFABI::MappingsAttrName, Buffer.str());
+}
+
+static bool shouldEmitDeclareSimdVariantMetadata(const FunctionDecl *FD,
+                                                 const SourceManager &SM) {
+  if (!FD)
+    return false;
+
+  for (const FunctionDecl *Redecl : FD->redecls()) {
+    if (!Redecl->hasAttr<OMPDeclareSimdDeclAttr>())
+      continue;
+
+    unsigned BuiltinID = Redecl->getBuiltinID();
+    if (BuiltinID &&
+      Redecl->getASTContext().BuiltinInfo.isLibFunction(BuiltinID))
+      return true;
+
+    SourceLocation Loc = SM.getExpansionLoc(Redecl->getLocation());
+    if (!Loc.isValid())
+      continue;
+
+    if (SM.isInSystemHeader(Loc))
+      return true;
+
+    // Keep support for library-style declarations coming from headers that are
+    // not marked as system headers (e.g. staged glibc build directories), but
+    // avoid enabling this for every non-main-file declaration.
+    if (!SM.isWrittenInMainFile(Loc) && Redecl->hasExternalFormalLinkage())
+      return true;
+  }
+
+  return false;
+}
+
 // This are the Functions that are needed to mangle the name of the
 // vector functions generated by the compiler, according to the rules
 // defined in the "Vector Function ABI specifications for AArch64",
@@ -11577,6 +11647,7 @@ void CGOpenMPRuntime::emitDeclareSimdFunction(const FunctionDecl *FD,
                                               llvm::Function *Fn) {
   ASTContext &C = CGM.getContext();
   FD = FD->getMostRecentDecl();
+  const FunctionDecl *MostRecentFD = FD;
   while (FD) {
     // Map params to their positions in function decl.
     llvm::DenseMap<const Decl *, unsigned> ParamPositions;
@@ -11724,6 +11795,10 @@ void CGOpenMPRuntime::emitDeclareSimdFunction(const FunctionDecl *FD,
     }
     FD = FD->getPreviousDecl();
   }
+
+  if (shouldEmitDeclareSimdVariantMetadata(
+          MostRecentFD, CGM.getContext().getSourceManager()))
+    emitDeclareSimdVariantMetadata(CGM, Fn);
 }
 
 namespace {
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 5d88e5f54e3d6..15eee6fc2054c 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -2575,9 +2575,10 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI,
 
         // If the function has an explicit vectorized counterpart, and does not
         // take output/input pointers, we can safely assume that it can be
-        // vectorized.
-        if (Call && !Call->isNoBuiltin() && Call->getCalledFunction() &&
-            !hasPointerArgs(Call) && !VFDatabase::getMappings(*Call).empty())
+        // vectorized. This remains valid even when the scalar call is marked
+        // no-builtin, because the explicit mapping supplies the vector variant.
+        if (Call && Call->getCalledFunction() && !hasPointerArgs(Call) &&
+            !VFDatabase::getMappings(*Call).empty())
           continue;
 
         auto *Ld = dyn_cast<LoadInst>(&I);
diff --git a/llvm/lib/IR/VFABIDemangler.cpp b/llvm/lib/IR/VFABIDemangler.cpp
index 4fcf43616d60c..ab213d3034446 100644
--- a/llvm/lib/IR/VFABIDemangler.cpp
+++ b/llvm/lib/IR/VFABIDemangler.cpp
@@ -534,7 +534,11 @@ VFParamKind VFABI::getVFParamKindFromString(const StringRef Token) {
 
 void VFABI::getVectorVariantNames(
     const CallInst &CI, SmallVectorImpl<std::string> &VariantMappings) {
-  const StringRef S = CI.getFnAttr(VFABI::MappingsAttrName).getValueAsString();
+  StringRef S = CI.getFnAttr(VFABI::MappingsAttrName).getValueAsString();
+  if (S.empty()) {
+    if (const Function *F = CI.getCalledFunction())
+      S = F->getFnAttribute(VFABI::MappingsAttrName).getValueAsString();
+  }
   if (S.empty())
     return;
 
diff --git a/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp b/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp
index e53019768e881..a56459842914c 100644
--- a/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp
+++ b/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp
@@ -46,28 +46,34 @@ static void warnAboutLeftoverTransformations(Loop *L,
   }
 
   if (hasVectorizeTransformation(L) == TM_ForcedByUser) {
-    LLVM_DEBUG(dbgs() << "Leftover vectorization transformation\n");
-    std::optional<ElementCount> VectorizeWidth =
-        getOptionalElementCountLoopAttribute(L);
-    std::optional<int> InterleaveCount =
-        getOptionalIntLoopAttribute(L, "llvm.loop.interleave.count");
+    // OpenMP SIMD loops are represented as annotated parallel loops.  They can
+    // still legally execute without loop-vectorization, so avoid turning these
+    // cases into transform-warning diagnostics that are commonly promoted to
+    // hard errors by -Werror builds.
+    if (!L->isAnnotatedParallel()) {
+      LLVM_DEBUG(dbgs() << "Leftover vectorization transformation\n");
+      std::optional<ElementCount> VectorizeWidth =
+          getOptionalElementCountLoopAttribute(L);
+      std::optional<int> InterleaveCount =
+          getOptionalIntLoopAttribute(L, "llvm.loop.interleave.count");
 
-    if (!VectorizeWidth || VectorizeWidth->isVector())
-      ORE->emit(
-          DiagnosticInfoOptimizationFailure(DEBUG_TYPE,
-                                            "FailedRequestedVectorization",
-                                            L->getStartLoc(), L->getHeader())
-          << "loop not vectorized: the optimizer was unable to perform the "
-             "requested transformation; the transformation might be disabled "
-             "or specified as part of an unsupported transformation ordering");
-    else if (InterleaveCount.value_or(0) != 1)
-      ORE->emit(
-          DiagnosticInfoOptimizationFailure(DEBUG_TYPE,
-                                            "FailedRequestedInterleaving",
-                                            L->getStartLoc(), L->getHeader())
-          << "loop not interleaved: the optimizer was unable to perform the "
-             "requested transformation; the transformation might be disabled "
-             "or specified as part of an unsupported transformation ordering");
+      if (!VectorizeWidth || VectorizeWidth->isVector())
+        ORE->emit(
+            DiagnosticInfoOptimizationFailure(DEBUG_TYPE,
+                                              "FailedRequestedVectorization",
+                                              L->getStartLoc(), L->getHeader())
+            << "loop not vectorized: the optimizer was unable to perform the "
+               "requested transformation; the transformation might be disabled "
+               "or specified as part of an unsupported transformation ordering");
+      else if (InterleaveCount.value_or(0) != 1)
+        ORE->emit(
+            DiagnosticInfoOptimizationFailure(DEBUG_TYPE,
+                                              "FailedRequestedInterleaving",
+                                              L->getStartLoc(), L->getHeader())
+            << "loop not interleaved: the optimizer was unable to perform the "
+               "requested transformation; the transformation might be disabled "
+               "or specified as part of an unsupported transformation ordering");
+    }
   }
 
   if (hasDistributeTransformation(L) == TM_ForcedByUser) {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index a63956c0cba6b..c2097691e45b9 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5963,7 +5963,7 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
         break;
       }
 
-      if (TLI && VecFunc && !CI->isNoBuiltin())
+      if (VecFunc)
         VectorCost = TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind);
 
       // Find the cost of an intrinsic; some targets may have instructions that
diff --git a/llvm/test/Transforms/LoopTransformWarning/omp-simd-vectorization-remarks-suppressed.ll b/llvm/test/Transforms/LoopTransformWarning/omp-simd-vectorization-remarks-suppressed.ll
new file mode 100644
index 0000000000000..6f7773ac034ae
--- /dev/null
+++ b/llvm/test/Transforms/LoopTransformWarning/omp-simd-vectorization-remarks-suppressed.ll
@@ -0,0 +1,30 @@
+; RUN: opt -passes=transform-warning -disable-output -pass-remarks-missed=transform-warning -pass-remarks-analysis=transform-warning < %s 2>&1 | FileCheck -allow-empty %s
+;
+; OpenMP SIMD loops are represented as annotated parallel loops. Keep
+; transform-warning from emitting FailedRequestedVectorization for these loops.
+;
+; CHECK-NOT: FailedRequestedVectorization
+; CHECK-NOT: loop not vectorized
+
+define void @test(ptr %x, ptr %c) {
+entry:
+  br label %loop
+
+loop:
+  %i = phi i64 [ 0, %entry ], [ %next, %loop ]
+  %src.ptr = getelementptr inbounds double, ptr %c, i64 %i
+  %v = load double, ptr %src.ptr, align 8, !llvm.access.group !1
+  %dst.ptr = getelementptr inbounds double, ptr %x, i64 %i
+  store double %v, ptr %dst.ptr, align 8, !llvm.access.group !1
+  %next = add nuw nsw i64 %i, 1
+  %done = icmp eq i64 %next, 64
+  br i1 %done, label %exit, label %loop, !llvm.loop !0
+
+exit:
+  ret void
+}
+
+!0 = distinct !{!0, !2, !3}
+!1 = distinct !{}
+!2 = !{!"llvm.loop.vectorize.enable", i1 true}
+!3 = !{!"llvm.loop.parallel_accesses", !1}
diff --git a/llvm/test/Transforms/LoopVectorize/no-builtin-vfabi.ll b/llvm/test/Transforms/LoopVectorize/no-builtin-vfabi.ll
new file mode 100644
index 0000000000000..78d843f5b0e0d
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/no-builtin-vfabi.ll
@@ -0,0 +1,51 @@
+; RUN: opt -passes=loop-vectorize -S < %s | FileCheck %s
+
+; NOTE: This is a focused reproducer for OpenMP declare-simd style VFABI mapping.
+
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @test_vector_abi() local_unnamed_addr #0 {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %src = getelementptr inbounds nuw double, ptr @c, i64 %iv
+  %v = load double, ptr %src, align 8, !tbaa !4, !llvm.access.group !8
+  %r = tail call double @acosh(double noundef %v) #2, !llvm.access.group !8
+  %dst = getelementptr inbounds nuw double, ptr @x, i64 %iv
+  store double %r, ptr %dst, align 8, !tbaa !4, !llvm.access.group !8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %done = icmp eq i64 %iv.next, 1000
+  br i1 %done, label %exit, label %loop, !llvm.loop !9
+
+exit:
+  ret i32 0
+}
+
+ at x = dso_local local_unnamed_addr global [1000 x double] zeroinitializer, align 16
+ at c = dso_local local_unnamed_addr global [1000 x double] zeroinitializer, align 16
+
+; CHECK-LABEL: @test_vector_abi(
+; CHECK: vector.body:
+; CHECK: call <2 x double> @_ZGVbN2v_acosh
+
+declare double @acosh(double noundef) local_unnamed_addr #1
+
+declare <2 x double> @_ZGVbN2v_acosh(<2 x double>)
+declare <4 x double> @_ZGVcN4v_acosh(<4 x double>)
+declare <4 x double> @_ZGVdN4v_acosh(<4 x double>)
+declare <8 x double> @_ZGVeN8v_acosh(<8 x double>)
+
+attributes #0 = { noinline nounwind strictfp }
+attributes #1 = { nounwind "_ZGVbN2v_acosh" "_ZGVcN4v_acosh" "_ZGVdN4v_acosh" "_ZGVeN8v_acosh" "no-builtins" "vector-function-abi-variant"="_ZGVbN2v_acosh,_ZGVcN4v_acosh,_ZGVdN4v_acosh,_ZGVeN8v_acosh" }
+attributes #2 = { nobuiltin nounwind strictfp "no-builtins" }
+
+!4 = !{!5, !5, i64 0}
+!5 = !{!"double", !6, i64 0}
+!6 = !{!"omnipotent char", !7, i64 0}
+!7 = !{!"Simple C/C++ TBAA"}
+!8 = distinct !{}
+!9 = distinct !{!9, !10, !11}
+!10 = !{!"llvm.loop.parallel_accesses", !8}
+!11 = !{!"llvm.loop.vectorize.enable", i1 true}

``````````

</details>


https://github.com/llvm/llvm-project/pull/189523