[flang-commits] [flang] [llvm] [mlir] [flang][mlir][OpenMP] Boost inline threshold for calls inside OpenMP SIMD loops (PR #195903)

Tue May 5 13:57:02 PDT 2026

https://github.com/chichunchen updated https://github.com/llvm/llvm-project/pull/195903

>From 2c12ea53c50028375046461cb2f53e22a3ab5381 Mon Sep 17 00:00:00 2001
From: "Chi Chun, Chen" <chichun.chen at hpe.com>
Date: Tue, 14 Apr 2026 18:23:13 -0500
Subject: [PATCH] [flang][mlir][OpenMP] Boost inline threshold for calls inside
 OpenMP SIMD loops

LLVM currently has no pass that generates vector function bodies (simd clones)
for OpenMP `declare simd` functions[1]. As a result, when a scalar function is
called inside an `!$omp simd` loop, LoopVectorize cannot vectorize the loop
because the call remains scalar.

This patch adds an MLIR pass (omp-simd-inline-boost) that marks function calls inside
`omp.simd` regions. The existing LLVM inliner uses that mark to increase the inline
threshold so that LoopVectorize can widen the inlined scalar instructions naturally.

[1] https://discourse.llvm.org/t/rfc-aggressive-inlinging-for-openmp-simd-loops/90558

Assisted by Copilot.
---
 flang/lib/Optimizer/CodeGen/CodeGen.cpp       | 12 ++++
 flang/lib/Optimizer/Passes/Pipelines.cpp      |  1 +
 flang/test/Fir/simd-inline-boost-codegen.fir  | 13 +++++
 .../Integration/OpenMP/simd-inline-boost.f90  | 38 +++++++++++++
 flang/test/Lower/OpenMP/host-eval.f90         |  4 +-
 llvm/include/llvm/Analysis/InlineCost.h       |  3 +
 llvm/lib/Analysis/InlineCost.cpp              |  5 ++
 .../Inline/inline-cost-attributes.ll          |  8 ++-
 .../mlir/Dialect/OpenMP/Transforms/Passes.td  | 11 ++++
 .../Dialect/OpenMP/Transforms/CMakeLists.txt  |  1 +
 .../Transforms/OpenMPSIMDInlineBoost.cpp      | 49 ++++++++++++++++
 .../Dialect/OpenMP/simd-inline-boost.mlir     | 56 +++++++++++++++++++
 12 files changed, 198 insertions(+), 3 deletions(-)
 create mode 100644 flang/test/Fir/simd-inline-boost-codegen.fir
 create mode 100644 flang/test/Integration/OpenMP/simd-inline-boost.f90
 create mode 100644 mlir/lib/Dialect/OpenMP/Transforms/OpenMPSIMDInlineBoost.cpp
 create mode 100644 mlir/test/Dialect/OpenMP/simd-inline-boost.mlir

diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
index 7d1068c25e7ca..c967a92d502a8 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -775,6 +775,18 @@ struct CallOpConversion : public fir::FIROpConversion<fir::CallOp> {
             call.getAccessGroups())
       llvmCall.setAccessGroups(*optionalAccessGroups);
 
+    // Boost inlining of calls inside OpenMP SIMD regions.
+    if (call->hasAttr("omp.simd_inline_boost")) {
+      mlir::NamedAttrList defaultFuncAttrs;
+      if (mlir::DictionaryAttr attrs = llvmCall.getDefaultFuncAttrsAttr())
+        defaultFuncAttrs.append(attrs.begin(), attrs.end());
+      defaultFuncAttrs.set("function-inline-threshold-bonus",
+                           rewriter.getStringAttr("2000"));
+      llvmCall.setDefaultFuncAttrsAttr(
+          defaultFuncAttrs.getDictionary(rewriter.getContext()));
+      llvmCall->removeAttr("omp.simd_inline_boost");
+    }
+
     if (memAttr)
       llvmCall.setMemoryEffectsAttr(
           mlir::cast<mlir::LLVM::MemoryEffectsAttr>(memAttr));
diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp
index 920d6f86a355e..77c58c8237a17 100644
--- a/flang/lib/Optimizer/Passes/Pipelines.cpp
+++ b/flang/lib/Optimizer/Passes/Pipelines.cpp
@@ -370,6 +370,7 @@ void createOpenMPFIRPassPipeline(mlir::PassManager &pm,
   pm.addPass(flangomp::createAutomapToTargetDataPass());
   pm.addPass(flangomp::createMapInfoFinalizationPass());
   pm.addPass(mlir::omp::createMarkDeclareTargetPass());
+  pm.addPass(mlir::omp::createOpenMPSIMDInlineBoostPass());
 
   // Delete unreachable target operations before FunctionFilteringPass
   // extracts them.
diff --git a/flang/test/Fir/simd-inline-boost-codegen.fir b/flang/test/Fir/simd-inline-boost-codegen.fir
new file mode 100644
index 0000000000000..68d9a33ffbd0a
--- /dev/null
+++ b/flang/test/Fir/simd-inline-boost-codegen.fir
@@ -0,0 +1,13 @@
+// RUN: fir-opt --fir-to-llvm-ir %s | FileCheck %s
+
+module {
+  func.func private @foo()
+
+  func.func @test_merge_default_func_attrs() {
+    // CHECK-LABEL: llvm.func @test_merge_default_func_attrs
+    // CHECK: llvm.call @foo() {default_func_attrs = {existing = "1", "function-inline-threshold-bonus" = "2000"}} : () -> ()
+    // CHECK-NOT: omp.simd_inline_boost
+    fir.call @foo() {default_func_attrs = {existing = "1"}, omp.simd_inline_boost} : () -> ()
+    return
+  }
+}
diff --git a/flang/test/Integration/OpenMP/simd-inline-boost.f90 b/flang/test/Integration/OpenMP/simd-inline-boost.f90
new file mode 100644
index 0000000000000..28000b2583bd6
--- /dev/null
+++ b/flang/test/Integration/OpenMP/simd-inline-boost.f90
@@ -0,0 +1,38 @@
+! Test that function calls inside !$omp simd loops get boosted inline thresholds.
+!RUN: %flang_fc1 -emit-llvm -fopenmp %s -o - | FileCheck %s
+
+! CHECK-LABEL: define {{.*}} @test_simd_
+subroutine test_simd(x, n)
+  implicit none
+  integer, intent(in) :: n
+  real, intent(inout) :: x(n)
+  integer :: i
+  interface
+    real function foo(v)
+      real, intent(in) :: v
+    end function
+  end interface
+  !$omp simd
+  do i = 1, n
+    ! CHECK: call {{.*}}@foo_({{.*}}) #[[BOOST:[0-9]+]]
+    x(i) = foo(x(i))
+  end do
+  !$omp end simd
+end subroutine
+
+! Calls outside !$omp simd should NOT get the attribute.
+! CHECK-LABEL: define {{.*}} @no_simd_
+subroutine no_simd(x)
+  implicit none
+  real, intent(inout) :: x
+  interface
+    real function foo(v)
+      real, intent(in) :: v
+    end function
+  end interface
+  ! CHECK: call {{.*}}@foo_({{.*}})
+  ! CHECK-NOT: call {{.*}}@foo_({{.*}}) #[[BOOST]]
+  x = foo(x)
+end subroutine
+
+! CHECK: attributes #[[BOOST]] = {{{.*}}"function-inline-threshold-bonus"="2000"{{.*}}}
diff --git a/flang/test/Lower/OpenMP/host-eval.f90 b/flang/test/Lower/OpenMP/host-eval.f90
index 7a9c08895189d..96a6b5e00d630 100644
--- a/flang/test/Lower/OpenMP/host-eval.f90
+++ b/flang/test/Lower/OpenMP/host-eval.f90
@@ -160,7 +160,7 @@ subroutine distribute_parallel_do_simd()
   ! DEVICE-NOT: omp.parallel
   ! DEVICE-NOT: omp.distribute
   ! DEVICE-NOT: omp.wsloop
-  ! DEVICE-NOT: omp.simd
+  ! DEVICE-NOT: {{^ *}}omp.simd{{[ {]}}
   !$omp distribute parallel do simd num_threads(1)
   do i=1,10
     call foo()
@@ -269,7 +269,7 @@ subroutine distribute_simd()
   ! HOST-NEXT: omp.simd
 
   ! DEVICE-NOT: omp.distribute
-  ! DEVICE-NOT: omp.simd
+  ! DEVICE-NOT: {{^ *}}omp.simd{{[ {]}}
   !$omp distribute simd
   do i=1,10
     call foo()
diff --git a/llvm/include/llvm/Analysis/InlineCost.h b/llvm/include/llvm/Analysis/InlineCost.h
index 1faf480c590ac..4ab50b6ba75cc 100644
--- a/llvm/include/llvm/Analysis/InlineCost.h
+++ b/llvm/include/llvm/Analysis/InlineCost.h
@@ -60,6 +60,9 @@ const uint64_t MaxSimplifiedDynamicAllocaToInline = 65536;
 const char FunctionInlineCostMultiplierAttributeName[] =
     "function-inline-cost-multiplier";
 
+const char FunctionInlineThresholdBonusAttributeName[] =
+    "function-inline-threshold-bonus";
+
 const char MaxInlineStackSizeAttributeName[] = "inline-max-stacksize";
 } // namespace InlineConstants
 
diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp
index d975a93e9b1fd..fb1163fb24d31 100644
--- a/llvm/lib/Analysis/InlineCost.cpp
+++ b/llvm/lib/Analysis/InlineCost.cpp
@@ -1120,6 +1120,11 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
             getStringFnAttrAsInt(CandidateCall, "function-inline-threshold"))
       Threshold = *AttrThreshold;
 
+    if (std::optional<int> AttrThresholdBonus = getStringFnAttrAsInt(
+            CandidateCall,
+            InlineConstants::FunctionInlineThresholdBonusAttributeName))
+      Threshold += *AttrThresholdBonus;
+
     if (auto Result = costBenefitAnalysis()) {
       DecidedByCostBenefit = true;
       if (*Result)
diff --git a/llvm/test/Transforms/Inline/inline-cost-attributes.ll b/llvm/test/Transforms/Inline/inline-cost-attributes.ll
index 71264ab6c389f..49872d19a9fe5 100644
--- a/llvm/test/Transforms/Inline/inline-cost-attributes.ll
+++ b/llvm/test/Transforms/Inline/inline-cost-attributes.ll
@@ -11,12 +11,15 @@ entry:
 
 define void @fn2() "function-inline-threshold"="41" {
 ; INLINER-LABEL: Inlining calls in: fn2
-; INLINER-NEXT: Function size: 7
+; INLINER-NEXT: Function size: 8
 ; INLINER-NEXT: NOT Inlining (cost=321, threshold=123), Call:   call void @fn1()
 ; INLINER-NEXT: NOT Inlining (cost=963, threshold=123), Call:   call void @fn1()
 ; INLINER-NEXT: NOT Inlining (cost=321, threshold=321), Call:   call void @fn1()
 ; INLINER-NEXT: NOT Inlining (cost=197, threshold=123), Call:   call void @fn1()
 ; INLINER-NEXT: Inlining (cost=197, threshold=321), Call:   call void @fn1()
+; INLINER-NEXT: Size after inlining: 7
+; INLINER-NEXT: Inlining (cost=321, threshold=523), Call:   call void @fn1()
+; INLINER-NEXT: Size after inlining: 6
 
 ; COST-LABEL: define void @fn2()
 ; COST-NEXT: entry:
@@ -32,6 +35,8 @@ define void @fn2() "function-inline-threshold"="41" {
 ; COST-NEXT: call void @fn1()
 ; COST-NEXT: cost delta = 473
 ; COST-NEXT: call void @fn1()
+; COST-NEXT: cost delta = 271
+; COST-NEXT: call void @fn1()
 
 entry:
   call void @extern()
@@ -40,6 +45,7 @@ entry:
   call void @fn1() "call-inline-cost"="0" "function-inline-threshold"="321"
   call void @fn1() "call-threshold-bonus"="17" "function-inline-cost"="197"
   call void @fn1() "call-inline-cost"="473" "function-inline-cost"="197" "function-inline-threshold"="321"
+  call void @fn1() "function-inline-threshold-bonus"="400"
   ret void
 }
 
diff --git a/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.td b/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.td
index e6321ef58b45f..3755c262eca41 100644
--- a/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.td
@@ -52,4 +52,15 @@ def StackToSharedPass : Pass<"omp-stack-to-shared", "mlir::LLVM::LLVMFuncOp"> {
   let dependentDialects = ["mlir::omp::OpenMPDialect"];
 }
 
+def OpenMPSIMDInlineBoostPass : Pass<"omp-simd-inline-boost", "ModuleOp"> {
+  let summary = "Boost inline threshold for calls inside OpenMP SIMD loops";
+  let description = [{
+    Marks function calls inside omp.simd regions with a discardable attribute
+    (omp.simd_inline_boost) so that the FIR-to-LLVM conversion can set
+    "function-inline-threshold-bonus" on the resulting llvm.call. This enables
+    aggressive inlining of scalar function calls inside SIMD loops, allowing
+    LoopVectorize to vectorize the inlined loop body.
+  }];
+  let dependentDialects = ["mlir::omp::OpenMPDialect"];
+}
 #endif // MLIR_DIALECT_OPENMP_TRANSFORMS_PASSES
diff --git a/mlir/lib/Dialect/OpenMP/Transforms/CMakeLists.txt b/mlir/lib/Dialect/OpenMP/Transforms/CMakeLists.txt
index 569786fe95cf3..a916fe257cc98 100644
--- a/mlir/lib/Dialect/OpenMP/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/OpenMP/Transforms/CMakeLists.txt
@@ -2,6 +2,7 @@ add_mlir_dialect_library(MLIROpenMPTransforms
   MarkDeclareTarget.cpp
   OpenMPOffloadPrivatizationPrepare.cpp
   StackToShared.cpp
+  OpenMPSIMDInlineBoost.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/OpenMP
diff --git a/mlir/lib/Dialect/OpenMP/Transforms/OpenMPSIMDInlineBoost.cpp b/mlir/lib/Dialect/OpenMP/Transforms/OpenMPSIMDInlineBoost.cpp
new file mode 100644
index 0000000000000..f4b2dc39338ae
--- /dev/null
+++ b/mlir/lib/Dialect/OpenMP/Transforms/OpenMPSIMDInlineBoost.cpp
@@ -0,0 +1,49 @@
+//===- OpenMPSIMDInlineBoost.cpp
+//-------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Mark function calls inside OpenMP SIMD regions with omp.simd_inline_boost
+// so FIR-to-LLVM conversion can add an LLVM inline-threshold bonus, enabling
+// more aggressive inlining for vectorization.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/Interfaces/CallInterfaces.h"
+#include "mlir/Pass/Pass.h"
+
+namespace mlir {
+namespace omp {
+
+#define GEN_PASS_DEF_OPENMPSIMDINLINEBOOSTPASS
+#include "mlir/Dialect/OpenMP/Transforms/Passes.h.inc"
+
+} // namespace omp
+} // namespace mlir
+
+using namespace mlir;
+namespace {
+
+class OpenMPSIMDInlineBoostPass
+    : public omp::impl::OpenMPSIMDInlineBoostPassBase<
+          OpenMPSIMDInlineBoostPass> {
+
+  void runOnOperation() override {
+    getOperation()->walk([](omp::SimdOp simdOp) {
+      simdOp->walk([](CallOpInterface callOp) {
+        Operation *op = callOp.getOperation();
+        if (op->hasAttr("omp.simd_inline_boost"))
+          return;
+        op->setAttr("omp.simd_inline_boost", UnitAttr::get(op->getContext()));
+      });
+    });
+  }
+};
+
+} // namespace
diff --git a/mlir/test/Dialect/OpenMP/simd-inline-boost.mlir b/mlir/test/Dialect/OpenMP/simd-inline-boost.mlir
new file mode 100644
index 0000000000000..80d19a87e1378
--- /dev/null
+++ b/mlir/test/Dialect/OpenMP/simd-inline-boost.mlir
@@ -0,0 +1,56 @@
+// RUN: mlir-opt -omp-simd-inline-boost %s | FileCheck %s
+
+func.func private @callee(%arg0: f32) -> f32
+
+// CHECK-LABEL: func.func @simd_with_call
+func.func @simd_with_call(%lb: index, %ub: index, %step: index, %a: memref<?xf32>) {
+  omp.simd {
+    omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) {
+      %val = memref.load %a[%iv] : memref<?xf32>
+      // CHECK: func.call @callee(%{{.*}}) {omp.simd_inline_boost} : (f32) -> f32
+      %res = func.call @callee(%val) : (f32) -> f32
+      memref.store %res, %a[%iv] : memref<?xf32>
+      omp.yield
+    }
+  }
+  return
+}
+
+// Calls outside omp.simd should NOT be modified.
+// CHECK-LABEL: func.func @no_simd
+func.func @no_simd(%v: f32) -> f32 {
+  // CHECK: call @callee(%{{.*}}) : (f32) -> f32
+  // CHECK-NOT: omp.simd_inline_boost
+  %res = func.call @callee(%v) : (f32) -> f32
+  return %res : f32
+}
+
+// Composite wsloop+simd: calls inside omp.simd should be boosted.
+// CHECK-LABEL: func.func @wsloop_simd_with_call
+func.func @wsloop_simd_with_call(%lb: index, %ub: index, %step: index, %a: memref<?xf32>) {
+  omp.wsloop {
+    omp.simd {
+      omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) {
+        %val = memref.load %a[%iv] : memref<?xf32>
+        // CHECK: func.call @callee(%{{.*}}) {omp.simd_inline_boost} : (f32) -> f32
+        %res = func.call @callee(%val) : (f32) -> f32
+        memref.store %res, %a[%iv] : memref<?xf32>
+        omp.yield
+      }
+    } {omp.composite}
+  } {omp.composite}
+  return
+}
+
+// Calls already marked should not be re-marked (idempotent).
+// CHECK-LABEL: func.func @already_marked
+func.func @already_marked(%lb: index, %ub: index, %step: index, %v: f32) {
+  omp.simd {
+    omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) {
+      // CHECK: func.call @callee(%{{.*}}) {omp.simd_inline_boost} : (f32) -> f32
+      %res = func.call @callee(%v) {omp.simd_inline_boost} : (f32) -> f32
+      omp.yield
+    }
+  }
+  return
+}