[clang] [flang] [llvm] [flang] Add runtime trampoline pool for W^X compliance (PR #183108)
Sairudra More via cfe-commits
cfe-commits at lists.llvm.org
Wed Mar 4 13:00:37 PST 2026
https://github.com/Saieiei updated https://github.com/llvm/llvm-project/pull/183108
>From a866872e85a28dc20bacdd1d071c96564489a54c Mon Sep 17 00:00:00 2001
From: Sairudra More <moresair at pe31.hpc.amslabs.hpecorp.net>
Date: Tue, 24 Feb 2026 11:21:17 -0600
Subject: [PATCH] [flang] Add runtime trampoline pool for W^X compliance
Replace LLVM's llvm.init.trampoline / llvm.adjust.trampoline intrinsics
with a runtime-managed trampoline pool that keeps the stack non-executable,
satisfying W^X (Write XOR Execute) policies enforced by modern toolchains
and security-hardened kernels.
The new -fsafe-trampoline flag activates this path in
BoxedProcedure, which emits calls to _FortranATrampolineInit,
_FortranATrampolineAdjust, and _FortranATrampolineFree instead of
the legacy LLVM trampoline intrinsics.
Closes #182813
---
clang/include/clang/Options/Options.td | 5 +
clang/lib/Driver/ToolChains/Flang.cpp | 14 +
.../include/flang-rt/runtime/trampoline.h | 66 +++
flang-rt/lib/runtime/CMakeLists.txt | 1 +
flang-rt/lib/runtime/trampoline.cpp | 435 ++++++++++++++
.../test/Driver/safe-trampoline-gnustack.f90 | 45 ++
flang/docs/RuntimeEnvironment.md | 13 +
.../include/flang/Frontend/CodeGenOptions.def | 1 +
.../Optimizer/Builder/Runtime/RTBuilder.h | 4 +
.../Optimizer/Builder/Runtime/Trampoline.h | 47 ++
.../flang/Optimizer/CodeGen/CGPasses.td | 16 +-
.../flang/Optimizer/Passes/CommandLineOpts.h | 1 +
.../flang/Optimizer/Passes/Pipelines.h | 3 +-
flang/include/flang/Runtime/trampoline.h | 69 +++
flang/include/flang/Tools/CrossToolHelpers.h | 2 +
flang/lib/Frontend/CompilerInvocation.cpp | 4 +
flang/lib/Optimizer/Builder/CMakeLists.txt | 1 +
.../Optimizer/Builder/Runtime/Trampoline.cpp | 49 ++
.../lib/Optimizer/CodeGen/BoxedProcedure.cpp | 535 ++++++++++++------
.../lib/Optimizer/Passes/CommandLineOpts.cpp | 2 +
flang/lib/Optimizer/Passes/Pipelines.cpp | 15 +-
flang/test/Driver/fsafe-trampoline.f90 | 13 +
flang/test/Fir/boxproc-safe-trampoline.fir | 99 ++++
flang/test/Lower/safe-trampoline.f90 | 126 +++++
24 files changed, 1368 insertions(+), 198 deletions(-)
create mode 100644 flang-rt/include/flang-rt/runtime/trampoline.h
create mode 100644 flang-rt/lib/runtime/trampoline.cpp
create mode 100644 flang-rt/test/Driver/safe-trampoline-gnustack.f90
create mode 100644 flang/include/flang/Optimizer/Builder/Runtime/Trampoline.h
create mode 100644 flang/include/flang/Runtime/trampoline.h
create mode 100644 flang/lib/Optimizer/Builder/Runtime/Trampoline.cpp
create mode 100644 flang/test/Driver/fsafe-trampoline.f90
create mode 100644 flang/test/Fir/boxproc-safe-trampoline.fir
create mode 100644 flang/test/Lower/safe-trampoline.f90
diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td
index 5c0a6e3c913ea..9eafa089303a3 100644
--- a/clang/include/clang/Options/Options.td
+++ b/clang/include/clang/Options/Options.td
@@ -7588,6 +7588,11 @@ defm stack_arrays : BoolOptionWithoutMarshalling<"f", "stack-arrays",
PosFlag<SetTrue, [], [ClangOption], "Attempt to allocate array temporaries on the stack, no matter their size">,
NegFlag<SetFalse, [], [ClangOption], "Allocate array temporaries on the heap (default)">>;
+defm safe_trampoline : BoolOptionWithoutMarshalling<"f",
+ "safe-trampoline",
+ PosFlag<SetTrue, [], [FlangOption], "Use W^X compliant runtime trampoline pool for internal procedures">,
+ NegFlag<SetFalse, [], [FlangOption], "Use stack-based trampolines for internal procedures (default, may require executable stack)">>;
+
defm loop_versioning : BoolOptionWithoutMarshalling<"f", "version-loops-for-stride",
PosFlag<SetTrue, [], [ClangOption], "Create unit-strided versions of loops">,
NegFlag<SetFalse, [], [ClangOption], "Do not create unit-strided loops (default)">>;
diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp
index 8425f8fec62a4..053886983670c 100644
--- a/clang/lib/Driver/ToolChains/Flang.cpp
+++ b/clang/lib/Driver/ToolChains/Flang.cpp
@@ -203,6 +203,20 @@ void Flang::addCodegenOptions(const ArgList &Args,
!stackArrays->getOption().matches(options::OPT_fno_stack_arrays))
CmdArgs.push_back("-fstack-arrays");
+ if (Args.hasFlag(options::OPT_fsafe_trampoline,
+ options::OPT_fno_safe_trampoline, false)) {
+ const llvm::Triple &T = getToolChain().getTriple();
+ if (T.getArch() == llvm::Triple::x86_64 ||
+ T.getArch() == llvm::Triple::aarch64 ||
+ T.getArch() == llvm::Triple::aarch64_be) {
+ CmdArgs.push_back("-fsafe-trampoline");
+ } else {
+ getToolChain().getDriver().Diag(
+ diag::warn_drv_unsupported_option_for_target)
+ << "-fsafe-trampoline" << T.str();
+ }
+ }
+
// -fno-protect-parens is the default for -Ofast.
if (!Args.hasFlag(options::OPT_fprotect_parens,
options::OPT_fno_protect_parens,
diff --git a/flang-rt/include/flang-rt/runtime/trampoline.h b/flang-rt/include/flang-rt/runtime/trampoline.h
new file mode 100644
index 0000000000000..5320c8161bf68
--- /dev/null
+++ b/flang-rt/include/flang-rt/runtime/trampoline.h
@@ -0,0 +1,66 @@
+//===-- flang-rt/runtime/trampoline.h ----------------------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Internal declarations for the W^X-compliant trampoline pool.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FLANG_RT_RUNTIME_TRAMPOLINE_H_
+#define FLANG_RT_RUNTIME_TRAMPOLINE_H_
+
+#include <cstddef>
+#include <cstdint>
+
+namespace Fortran::runtime::trampoline {
+
+/// Per-trampoline data entry. Stored in a writable (non-executable) region.
+/// Each entry is paired with a trampoline code stub in the executable region.
+struct TrampolineData {
+ const void *calleeAddress{nullptr};
+ const void *staticChainAddress{nullptr};
+};
+
+/// Default number of trampoline slots in the pool.
+/// Can be overridden via FLANG_TRAMPOLINE_POOL_SIZE environment variable.
+constexpr std::size_t kDefaultPoolSize{1024};
+
+/// Size of each trampoline code stub in bytes (platform-specific).
+#if defined(__x86_64__) || defined(_M_X64)
+// x86-64 trampoline stub (one per pool slot):
+//   movabsq $tdata_entry, %r11   # absolute address of this slot's
+//                                # TrampolineData entry
+//   movq 8(%r11), %r10           # load static chain -> %r10 (nest reg)
+//   jmpq *(%r11)                 # indirect jump to the callee address
+// The stub never embeds the callee address directly; it always
+// indirects through the writable TDATA entry, so (re)initializing a
+// slot touches only data pages and the code region can remain RX.
+// Encoded size is 17 bytes (10 + 4 + 3); each slot is padded to
+// kTrampolineStubSize with INT3 so any stray jump into the padding
+// traps immediately.
+constexpr std::size_t kTrampolineStubSize{32};
+constexpr int kNestRegister{10}; // %r10 is the nest/static chain register
+#elif defined(__aarch64__) || defined(_M_ARM64)
+// AArch64 trampoline stub (one per pool slot):
+//   ldr x17, .Ldata_addr  # load TDATA entry address (inline 64-bit literal)
+//   ldr x15, [x17, #8]    # load static chain into x15 (nest reg)
+//   ldr x17, [x17]        # load callee address
+//   br  x17
+constexpr std::size_t kTrampolineStubSize{32};
+constexpr int kNestRegister{15}; // x15 is the nest / static-chain register
+#elif defined(__powerpc64__) || defined(__ppc64__)
+constexpr std::size_t kTrampolineStubSize{48};
+constexpr int kNestRegister{11}; // r11
+#else
+// Fallback: generous size
+constexpr std::size_t kTrampolineStubSize{64};
+constexpr int kNestRegister{0};
+#endif
+
+} // namespace Fortran::runtime::trampoline
+
+#endif // FLANG_RT_RUNTIME_TRAMPOLINE_H_
diff --git a/flang-rt/lib/runtime/CMakeLists.txt b/flang-rt/lib/runtime/CMakeLists.txt
index 9fa8376e9b99c..d5e89a169255c 100644
--- a/flang-rt/lib/runtime/CMakeLists.txt
+++ b/flang-rt/lib/runtime/CMakeLists.txt
@@ -88,6 +88,7 @@ set(host_sources
stop.cpp
temporary-stack.cpp
time-intrinsic.cpp
+ trampoline.cpp
unit-map.cpp
)
if (TARGET llvm-libc-common-utilities)
diff --git a/flang-rt/lib/runtime/trampoline.cpp b/flang-rt/lib/runtime/trampoline.cpp
new file mode 100644
index 0000000000000..8f7e97ce62166
--- /dev/null
+++ b/flang-rt/lib/runtime/trampoline.cpp
@@ -0,0 +1,435 @@
+//===-- lib/runtime/trampoline.cpp -------------------------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// W^X-compliant trampoline pool implementation.
+//
+// This file implements a runtime trampoline pool that maintains separate
+// memory regions for executable code (RX) and writable data (RW).
+//
+// On Linux the code region transitions RW → RX (never simultaneously W+X).
+// On macOS Apple Silicon the code region uses MAP_JIT with per-thread W^X
+// toggling via pthread_jit_write_protect_np, so the mapping permissions
+// include both W and X but hardware enforces that only one is active at
+// a time on any given thread.
+//
+// Architecture:
+// - Code region (RX): Contains pre-assembled trampoline stubs that load
+// callee address and static chain from a paired TDATA entry, then jump
+// to the callee with the static chain in the appropriate register.
+// - Data region (RW): Contains TrampolineData entries with {callee_address,
+// static_chain_address} pairs, one per trampoline slot.
+// - Free list: Tracks available trampoline slots for O(1) alloc/free.
+//
+// Thread safety: Uses Fortran::runtime::Lock (pthreads on POSIX,
+// CRITICAL_SECTION on Windows) — not std::mutex — to avoid C++ runtime
+// library dependence. A single global lock serializes pool operations.
+// This is a deliberate V1 design choice to keep the initial W^X
+// architectural change minimal. Per-thread lock-free pools are deferred
+// to a future optimization patch.
+//
+// AddressSanitizer note: The trampoline code region is allocated via
+// mmap (not malloc/new), so ASan does not track it. The data region
+// and handles are allocated via malloc (through AllocateMemoryOrCrash),
+// which ASan intercepts normally. No special annotations are needed.
+//
+// See flang/docs/InternalProcedureTrampolines.md for design details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Runtime/trampoline.h"
+#include "flang-rt/runtime/lock.h"
+#include "flang-rt/runtime/memory.h"
+#include "flang-rt/runtime/terminator.h"
+#include "flang-rt/runtime/trampoline.h"
+#include "flang/Runtime/freestanding-tools.h"
+
+#include <atomic>
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+
+// Platform-specific headers for memory mapping.
+#if defined(_WIN32)
+#include <windows.h>
+#else
+#include <sys/mman.h>
+#include <unistd.h>
+#endif
+
+// macOS Apple Silicon requires MAP_JIT and pthread_jit_write_protect_np
+// to create executable memory under the hardened runtime.
+#if defined(__APPLE__) && defined(__aarch64__)
+#include <libkern/OSCacheControl.h>
+#include <pthread.h>
+#endif
+
+// Architecture support check. Stub generators exist only for x86-64 and
+// AArch64. On other architectures the file compiles but the runtime API
+// functions crash with a diagnostic if actually called, so that building
+// flang-rt on e.g. RISC-V or PPC64 never fails.
+#if defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__) || \
+ defined(_M_ARM64)
+#define TRAMPOLINE_ARCH_SUPPORTED 1
+#else
+#define TRAMPOLINE_ARCH_SUPPORTED 0
+#endif
+
+namespace Fortran::runtime::trampoline {
+
+/// A handle returned to the caller. Contains enough info to find
+/// both the trampoline stub and its data entry.
+struct TrampolineHandle {
+ void *codePtr{nullptr}; // Pointer to the trampoline stub in the RX region.
+ std::size_t slotIndex{0}; // Index in the pool for free-list management.
+};
+
+// Namespace-scope globals following Flang runtime conventions:
+// - Lock is trivially constructible (pthread_mutex_t / CRITICAL_SECTION)
+// - Pool pointer uses std::atomic for safe double-checked locking
+class TrampolinePool; // Forward declaration for pointer below.
+static Lock poolLock;
+static std::atomic<TrampolinePool *> poolInstance{nullptr};
+
+/// The global trampoline pool.
+class TrampolinePool {
+public:
+ TrampolinePool() = default;
+
+ static TrampolinePool &instance() {
+ TrampolinePool *p{poolInstance.load(std::memory_order_acquire)};
+ if (p) {
+ return *p;
+ }
+ CriticalSection critical{poolLock};
+ p = poolInstance.load(std::memory_order_relaxed);
+ if (p) {
+ return *p;
+ }
+ // Allocate pool using SizedNew (malloc + placement new).
+ Terminator terminator{__FILE__, __LINE__};
+ auto owning{SizedNew<TrampolinePool>{terminator}(sizeof(TrampolinePool))};
+ p = owning.release();
+ poolInstance.store(p, std::memory_order_release);
+ return *p;
+ }
+
+ /// Allocate a trampoline slot and initialize it.
+ TrampolineHandle *allocate(
+ const void *calleeAddress, const void *staticChainAddress) {
+ CriticalSection critical{lock_};
+ ensureInitialized();
+
+ if (freeHead_ == kInvalidIndex) {
+ // Pool exhausted — fixed size by design for V1.
+ // The pool capacity is controlled by FLANG_TRAMPOLINE_POOL_SIZE
+ // (default 1024). Dynamic slab growth can be added in a follow-up
+ // patch if real workloads demonstrate a need for it.
+ Terminator terminator{__FILE__, __LINE__};
+ terminator.Crash("Trampoline pool exhausted (max %zu slots). "
+ "Set FLANG_TRAMPOLINE_POOL_SIZE to increase.",
+ poolSize_);
+ }
+
+ std::size_t index{freeHead_};
+ freeHead_ = freeList_[index];
+
+ // Initialize the data entry.
+ dataRegion_[index].calleeAddress = calleeAddress;
+ dataRegion_[index].staticChainAddress = staticChainAddress;
+
+ // Create the handle using New (malloc + placement new).
+ Terminator terminator{__FILE__, __LINE__};
+ auto owning{New<TrampolineHandle>{terminator}()};
+ TrampolineHandle *handle{owning.release()};
+ handle->codePtr =
+ static_cast<char *>(codeRegion_) + index * kTrampolineStubSize;
+ handle->slotIndex = index;
+
+ return handle;
+ }
+
+ /// Get the callable address of a trampoline.
+ void *getCallableAddress(TrampolineHandle *handle) { return handle->codePtr; }
+
+ /// Free a trampoline slot.
+ void free(TrampolineHandle *handle) {
+ CriticalSection critical{lock_};
+
+ std::size_t index{handle->slotIndex};
+
+ // Poison the data entry so that any dangling call through a freed
+ // trampoline traps immediately. Setting to NULL means the stub will
+ // jump to address 0, which is unmapped on all supported platforms
+ // and produces SIGSEGV/SIGBUS immediately.
+ dataRegion_[index].calleeAddress = nullptr;
+ dataRegion_[index].staticChainAddress = nullptr;
+
+ // Return slot to free list.
+ freeList_[index] = freeHead_;
+ freeHead_ = index;
+
+ FreeMemory(handle);
+ }
+
+private:
+ static constexpr std::size_t kInvalidIndex{~std::size_t{0}};
+
+ void ensureInitialized() {
+ if (initialized_) {
+ return;
+ }
+ initialized_ = true;
+
+ // Check environment variable for pool size override.
+ // Fixed-size pool by design (V1): avoids complexity of dynamic growth
+ // and re-protection of code pages. The default (1024 slots) is
+ // sufficient for typical Fortran programs. Users can override via:
+ // export FLANG_TRAMPOLINE_POOL_SIZE=4096
+ poolSize_ = kDefaultPoolSize;
+ if (const char *envSize = std::getenv("FLANG_TRAMPOLINE_POOL_SIZE")) {
+ long val{std::strtol(envSize, nullptr, 10)};
+ if (val > 0) {
+ poolSize_ = static_cast<std::size_t>(val);
+ }
+ }
+
+ // Allocate the data region (RW).
+ Terminator terminator{__FILE__, __LINE__};
+ dataRegion_ = static_cast<TrampolineData *>(
+ AllocateMemoryOrCrash(terminator, poolSize_ * sizeof(TrampolineData)));
+ runtime::memset(dataRegion_, 0, poolSize_ * sizeof(TrampolineData));
+
+ // Allocate the code region (initially RW for writing stubs, then RX).
+ std::size_t codeSize{poolSize_ * kTrampolineStubSize};
+#if defined(_WIN32)
+ codeRegion_ = VirtualAlloc(
+ nullptr, codeSize, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE);
+#elif defined(__APPLE__) && defined(__aarch64__)
+ // macOS Apple Silicon: MAP_JIT is required for pages that will become
+ // executable. Use pthread_jit_write_protect_np to toggle W↔X.
+ codeRegion_ = mmap(nullptr, codeSize, PROT_READ | PROT_WRITE | PROT_EXEC,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_JIT, -1, 0);
+ if (codeRegion_ == MAP_FAILED) {
+ codeRegion_ = nullptr;
+ }
+ if (codeRegion_) {
+ // Enable writing on this thread (MAP_JIT defaults to execute).
+ pthread_jit_write_protect_np(0); // 0 = writable
+ }
+#else
+ codeRegion_ = mmap(nullptr, codeSize, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (codeRegion_ == MAP_FAILED) {
+ codeRegion_ = nullptr;
+ }
+#endif
+ if (!codeRegion_) {
+ terminator.Crash("Failed to allocate trampoline code region");
+ }
+
+ // Generate trampoline stubs.
+ generateStubs();
+
+ // Flush instruction cache. Required on architectures with non-coherent
+ // I-cache/D-cache (AArch64, PPC, etc.). On x86-64 this is a no-op
+ // but harmless. Without this, AArch64 may execute stale instructions.
+#if defined(__APPLE__) && defined(__aarch64__)
+ // On macOS, use sys_icache_invalidate (from libkern/OSCacheControl.h).
+ sys_icache_invalidate(codeRegion_, codeSize);
+#elif defined(_WIN32)
+ FlushInstructionCache(GetCurrentProcess(), codeRegion_, codeSize);
+#else
+ __builtin___clear_cache(static_cast<char *>(codeRegion_),
+ static_cast<char *>(codeRegion_) + codeSize);
+#endif
+
+ // Make code region executable and non-writable (W^X).
+#if defined(_WIN32)
+ DWORD oldProtect;
+ VirtualProtect(codeRegion_, codeSize, PAGE_EXECUTE_READ, &oldProtect);
+#elif defined(__APPLE__) && defined(__aarch64__)
+ // Switch back to execute-only (MAP_JIT manages per-thread W^X).
+ pthread_jit_write_protect_np(1); // 1 = executable
+#else
+ mprotect(codeRegion_, codeSize, PROT_READ | PROT_EXEC);
+#endif
+
+ // Initialize free list.
+ freeList_ = static_cast<std::size_t *>(
+ AllocateMemoryOrCrash(terminator, poolSize_ * sizeof(std::size_t)));
+
+ for (std::size_t i{0}; i < poolSize_ - 1; ++i) {
+ freeList_[i] = i + 1;
+ }
+ freeList_[poolSize_ - 1] = kInvalidIndex;
+ freeHead_ = 0;
+ }
+
+ /// Generate platform-specific trampoline stubs in the code region.
+ /// Each stub loads callee address and static chain from its paired
+ /// TDATA entry and jumps to the callee.
+ void generateStubs() {
+#if defined(__x86_64__) || defined(_M_X64)
+ generateStubsX86_64();
+#elif defined(__aarch64__) || defined(_M_ARM64)
+ generateStubsAArch64();
+#else
+ // Unsupported architecture — should never be reached because the
+ // extern "C" API functions guard with TRAMPOLINE_ARCH_SUPPORTED.
+ // Fill with trap bytes as a safety net.
+ runtime::memset(codeRegion_, 0, poolSize_ * kTrampolineStubSize);
+#endif
+ }
+
+#if defined(__x86_64__) || defined(_M_X64)
+ /// Generate x86-64 trampoline stubs.
+ ///
+ /// Each stub does:
+ /// movabsq $dataEntry, %r11 ; load TDATA entry address
+ /// movq 8(%r11), %r10 ; load static chain -> nest register
+ /// jmpq *(%r11) ; jump to callee address
+ ///
+ /// Total: 10 + 4 + 3 = 17 bytes, padded to kTrampolineStubSize.
+ void generateStubsX86_64() {
+ auto *code{static_cast<uint8_t *>(codeRegion_)};
+
+ for (std::size_t i{0}; i < poolSize_; ++i) {
+ uint8_t *stub{code + i * kTrampolineStubSize};
+
+ // Address of the corresponding TDATA entry.
+ auto dataAddr{reinterpret_cast<uint64_t>(&dataRegion_[i])};
+
+ std::size_t off{0};
+
+ // movabsq $dataAddr, %r11 (REX.W + B, opcode 0xBB for r11)
+ stub[off++] = 0x49; // REX.WB
+ stub[off++] = 0xBB; // MOV r11, imm64
+ runtime::memcpy(&stub[off], &dataAddr, 8);
+ off += 8;
+
+ // movq 8(%r11), %r10 (load staticChainAddress into r10)
+ stub[off++] = 0x4D; // REX.WRB
+ stub[off++] = 0x8B; // MOV r/m64 -> r64
+ stub[off++] = 0x53; // ModRM: [r11 + disp8], r10
+ stub[off++] = 0x08; // disp8 = 8
+
+ // jmpq *(%r11) (jump to calleeAddress)
+ stub[off++] = 0x41; // REX.B
+ stub[off++] = 0xFF; // JMP r/m64
+ stub[off++] = 0x23; // ModRM: [r11], opcode extension 4
+
+ // Pad the rest with INT3 (0xCC) for safety.
+ while (off < kTrampolineStubSize) {
+ stub[off++] = 0xCC;
+ }
+ }
+ }
+#endif
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+ /// Generate AArch64 trampoline stubs.
+ ///
+ /// Each stub does:
+ /// ldr x17, .Ldata_addr ; load TDATA entry address
+ /// ldr x15, [x17, #8] ; load static chain -> x15 (nest reg)
+ /// ldr x17, [x17] ; load callee address
+ /// br x17 ; jump to callee
+ /// .Ldata_addr:
+ /// .quad <address of dataRegion_[i]>
+ ///
+ /// Total: 4*4 + 8 = 24 bytes, padded to kTrampolineStubSize.
+ void generateStubsAArch64() {
+ auto *code{static_cast<uint8_t *>(codeRegion_)};
+
+ for (std::size_t i{0}; i < poolSize_; ++i) {
+ auto *stub{reinterpret_cast<uint32_t *>(code + i * kTrampolineStubSize)};
+
+ // Address of the corresponding TDATA entry.
+ auto dataAddr{reinterpret_cast<uint64_t>(&dataRegion_[i])};
+
+ // ldr x17, .Ldata_addr (PC-relative load, offset = 4 instructions = 16
+ // bytes) LDR (literal): opc=01, V=0, imm19=(16/4)=4, Rt=17
+ stub[0] = 0x58000091; // ldr x17, #16 (imm19=4, shifted left 2 = 16)
+ // Encoding: 0101 1000 0000 0000 0000 0000 1001 0001
+
+ // ldr x15, [x17, #8] (load static chain into x15, the nest register)
+ // LDR (unsigned offset): size=11, V=0, opc=01, imm12=1(×8), Rn=17, Rt=15
+ stub[1] = 0xF940062F; // ldr x15, [x17, #8]
+
+ // ldr x17, [x17] (load callee address)
+ // LDR (unsigned offset): size=11, V=0, opc=01, imm12=0, Rn=17, Rt=17
+ stub[2] = 0xF9400231; // ldr x17, [x17, #0]
+
+ // br x17
+ stub[3] = 0xD61F0220; // br x17
+
+ // .Ldata_addr: .quad dataRegion_[i]
+ runtime::memcpy(&stub[4], &dataAddr, 8);
+
+ // Pad remaining with BRK #0 (trap) for safety.
+ std::size_t usedWords{4 + 2}; // 4 instructions + 1 quad (2 words)
+ for (std::size_t w{usedWords}; w < kTrampolineStubSize / sizeof(uint32_t);
+ ++w) {
+ stub[w] = 0xD4200000; // brk #0
+ }
+ }
+ }
+#endif
+
+ Lock lock_;
+ bool initialized_{false};
+ std::size_t poolSize_{0};
+
+ void *codeRegion_{nullptr}; // RX after initialization
+ TrampolineData *dataRegion_{nullptr}; // RW always
+ std::size_t *freeList_{nullptr}; // Intrusive free list
+ std::size_t freeHead_{kInvalidIndex};
+};
+
+} // namespace Fortran::runtime::trampoline
+
+namespace Fortran::runtime {
+extern "C" {
+
+// Helper: crash with a clear message on unsupported architectures.
+// This is only reached if -fsafe-trampoline was used on a target
+// that lacks stub generators. The driver should emit a warning and
+// ignore the flag on unsupported architectures, but the runtime
+// provides a safety net.
+static inline void crashIfUnsupported() {
+#if !TRAMPOLINE_ARCH_SUPPORTED
+ Terminator terminator{__FILE__, __LINE__};
+ terminator.Crash("Runtime trampolines are not supported on this "
+ "architecture. Recompile without -fsafe-trampoline "
+ "to use the legacy stack-trampoline path.");
+#endif
+}
+
+void *RTDEF(TrampolineInit)(
+ void *scratch, const void *calleeAddress, const void *staticChainAddress) {
+ crashIfUnsupported();
+ auto &pool{trampoline::TrampolinePool::instance()};
+ return pool.allocate(calleeAddress, staticChainAddress);
+}
+
+void *RTDEF(TrampolineAdjust)(void *handle) {
+ crashIfUnsupported();
+ auto &pool{trampoline::TrampolinePool::instance()};
+ return pool.getCallableAddress(
+ static_cast<trampoline::TrampolineHandle *>(handle));
+}
+
+void RTDEF(TrampolineFree)(void *handle) {
+ crashIfUnsupported();
+ auto &pool{trampoline::TrampolinePool::instance()};
+ pool.free(static_cast<trampoline::TrampolineHandle *>(handle));
+}
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang-rt/test/Driver/safe-trampoline-gnustack.f90 b/flang-rt/test/Driver/safe-trampoline-gnustack.f90
new file mode 100644
index 0000000000000..88ef62d436e51
--- /dev/null
+++ b/flang-rt/test/Driver/safe-trampoline-gnustack.f90
@@ -0,0 +1,45 @@
+! UNSUPPORTED: system-windows
+! UNSUPPORTED: offload-cuda
+! UNSUPPORTED: system-darwin
+
+! Verify that -fsafe-trampoline produces an executable whose
+! GNU_STACK program header is RW (not RWE), proving W^X compliance.
+! The legacy stack-trampoline path requires an executable stack; the
+! runtime trampoline pool does not.
+
+! RUN: %flang %isysroot -fsafe-trampoline -L"%libdir" %s -o %t
+! RUN: llvm-readelf -lW %t | FileCheck %s
+
+! Ensure GNU_STACK exists and has RW flags (no E).
+! CHECK: GNU_STACK
+! CHECK-SAME: RW
+! CHECK-NOT: RWE
+
+subroutine host_proc(x, res)
+ implicit none
+ integer, intent(in) :: x
+ integer, intent(out) :: res
+
+ interface
+ function f_iface() result(r)
+ integer :: r
+ end function
+ end interface
+
+ procedure(f_iface), pointer :: fptr
+ fptr => inner
+ res = fptr()
+
+contains
+ function inner() result(r)
+ integer :: r
+ r = x + 1
+ end function
+end subroutine
+
+program test_gnustack
+ implicit none
+ integer :: result
+ call host_proc(1, result)
+ print *, result
+end program
diff --git a/flang/docs/RuntimeEnvironment.md b/flang/docs/RuntimeEnvironment.md
index 073826508927b..0414ea1e0d59c 100644
--- a/flang/docs/RuntimeEnvironment.md
+++ b/flang/docs/RuntimeEnvironment.md
@@ -71,3 +71,16 @@ compilers, but it is not mentioned in the standard.
Set `FORT_NO_EMPTY_ALLOCATION=1` to cause `ALLOCATE` statements
fail when the allocated size is empty.
+
+## `FLANG_TRAMPOLINE_POOL_SIZE`
+
+Set `FLANG_TRAMPOLINE_POOL_SIZE` to an integer value to control the maximum
+number of runtime trampoline slots available when `-fsafe-trampoline` is
+enabled. Each slot consists of a small executable code stub (size varies by
+target; e.g. 32 bytes on x86-64 and AArch64) backed by a writable data entry.
+The default is 1024 slots, which is sufficient for typical Fortran
+programs. If more internal-procedure closures are alive simultaneously than
+the pool can hold, the runtime terminates with a diagnostic message that
+includes the current pool capacity.
+
+Example: `export FLANG_TRAMPOLINE_POOL_SIZE=4096`
diff --git a/flang/include/flang/Frontend/CodeGenOptions.def b/flang/include/flang/Frontend/CodeGenOptions.def
index 05ee0e28bcaa6..fff6702cd6964 100644
--- a/flang/include/flang/Frontend/CodeGenOptions.def
+++ b/flang/include/flang/Frontend/CodeGenOptions.def
@@ -42,6 +42,7 @@ CODEGENOPT(PrepareForThinLTO , 1, 0) ///< Set when -flto=thin is enabled on the
///< compile step.
CODEGENOPT(ProtectParens, 1, 1) ///< -fprotect-parens (enable parenthesis protection)
CODEGENOPT(StackArrays, 1, 0) ///< -fstack-arrays (enable the stack-arrays pass)
+CODEGENOPT(EnableSafeTrampoline, 1, 0) ///< -fsafe-trampoline (W^X compliant trampoline pool)
CODEGENOPT(VectorizeLoop, 1, 0) ///< Enable loop vectorization.
CODEGENOPT(VectorizeSLP, 1, 0) ///< Enable SLP vectorization.
CODEGENOPT(InterchangeLoops, 1, 0) ///< Enable loop interchange.
diff --git a/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h b/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h
index 2c74ab29f75e8..89d1aff4b26a8 100644
--- a/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h
+++ b/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h
@@ -244,6 +244,10 @@ constexpr TypeBuilderFunc getModel<void *>() {
};
}
template <>
+constexpr TypeBuilderFunc getModel<const void *>() {
+ return getModel<void *>();
+}
+template <>
constexpr TypeBuilderFunc getModel<void (*)(int)>() {
return [](mlir::MLIRContext *context) -> mlir::Type {
return fir::LLVMPointerType::get(
diff --git a/flang/include/flang/Optimizer/Builder/Runtime/Trampoline.h b/flang/include/flang/Optimizer/Builder/Runtime/Trampoline.h
new file mode 100644
index 0000000000000..45e079818c1fc
--- /dev/null
+++ b/flang/include/flang/Optimizer/Builder/Runtime/Trampoline.h
@@ -0,0 +1,47 @@
+//===-- Trampoline.h - Runtime trampoline pool builder ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Builder routines for generating calls to the Fortran runtime trampoline
+// pool APIs (_FortranATrampolineInit, _FortranATrampolineAdjust,
+// _FortranATrampolineFree).
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_OPTIMIZER_BUILDER_RUNTIME_TRAMPOLINE_H
+#define FORTRAN_OPTIMIZER_BUILDER_RUNTIME_TRAMPOLINE_H
+
+namespace mlir {
+class Value;
+class Location;
+} // namespace mlir
+
+namespace fir {
+class FirOpBuilder;
+}
+
+namespace fir::runtime {
+
+/// Generate a call to _FortranATrampolineInit.
+/// Returns an opaque handle (void*) for the trampoline.
+mlir::Value genTrampolineInit(fir::FirOpBuilder &builder, mlir::Location loc,
+ mlir::Value scratch, mlir::Value calleeAddress,
+ mlir::Value staticChainAddress);
+
+/// Generate a call to _FortranATrampolineAdjust.
+/// Returns the callable function pointer for the trampoline.
+mlir::Value genTrampolineAdjust(fir::FirOpBuilder &builder, mlir::Location loc,
+ mlir::Value handle);
+
+/// Generate a call to _FortranATrampolineFree.
+/// Frees the trampoline slot.
+void genTrampolineFree(fir::FirOpBuilder &builder, mlir::Location loc,
+ mlir::Value handle);
+
+} // namespace fir::runtime
+
+#endif // FORTRAN_OPTIMIZER_BUILDER_RUNTIME_TRAMPOLINE_H
diff --git a/flang/include/flang/Optimizer/CodeGen/CGPasses.td b/flang/include/flang/Optimizer/CodeGen/CGPasses.td
index df0ecf5540776..22025c19121f3 100644
--- a/flang/include/flang/Optimizer/CodeGen/CGPasses.td
+++ b/flang/include/flang/Optimizer/CodeGen/CGPasses.td
@@ -91,12 +91,18 @@ def TargetRewritePass : Pass<"target-rewrite", "mlir::ModuleOp"> {
}
def BoxedProcedurePass : Pass<"boxed-procedure", "mlir::ModuleOp"> {
- let options = [
- Option<"useThunks", "use-thunks",
- "bool", /*default=*/"true",
+ let options =
+ [Option<
+ "useThunks", "use-thunks", "bool", /*default=*/"true",
"Convert procedure pointer abstractions to a single code pointer, "
- "deploying thunks wherever required.">
- ];
+ "deploying thunks wherever required.">,
+ Option<
+ "useSafeTrampoline", "use-safe-trampoline", "bool",
+ /*default=*/"false",
+ "Use runtime trampoline pool instead of stack-based trampolines "
+ "for W^X compliance. When enabled, internal procedure pointers "
+ "use a runtime-managed pool of executable trampolines with "
+ "separate data region, avoiding the need for an executable stack.">];
}
def LowerRepackArraysPass : Pass<"lower-repack-arrays", "mlir::ModuleOp"> {
diff --git a/flang/include/flang/Optimizer/Passes/CommandLineOpts.h b/flang/include/flang/Optimizer/Passes/CommandLineOpts.h
index 1dc8e4f079d72..882f02032a3b8 100644
--- a/flang/include/flang/Optimizer/Passes/CommandLineOpts.h
+++ b/flang/include/flang/Optimizer/Passes/CommandLineOpts.h
@@ -65,6 +65,7 @@ extern llvm::cl::opt<bool> disableDebugInfo;
extern llvm::cl::opt<bool> disableFirToLlvmIr;
extern llvm::cl::opt<bool> disableLlvmIrToLlvm;
extern llvm::cl::opt<bool> disableBoxedProcedureRewrite;
+extern llvm::cl::opt<bool> enableSafeTrampoline;
extern llvm::cl::opt<bool> disableExternalNameConversion;
extern llvm::cl::opt<bool> enableConstantArgumentGlobalisation;
diff --git a/flang/include/flang/Optimizer/Passes/Pipelines.h b/flang/include/flang/Optimizer/Passes/Pipelines.h
index 1a7ff4ff3dfa2..04e0f096b36e3 100644
--- a/flang/include/flang/Optimizer/Passes/Pipelines.h
+++ b/flang/include/flang/Optimizer/Passes/Pipelines.h
@@ -93,7 +93,8 @@ void addTargetRewritePass(mlir::PassManager &pm);
mlir::LLVM::DIEmissionKind
getEmissionKind(llvm::codegenoptions::DebugInfoKind kind);
-void addBoxedProcedurePass(mlir::PassManager &pm);
+void addBoxedProcedurePass(mlir::PassManager &pm,
+ bool enableSafeTrampoline = false);
void addExternalNameConversionPass(mlir::PassManager &pm,
bool appendUnderscore = true);
diff --git a/flang/include/flang/Runtime/trampoline.h b/flang/include/flang/Runtime/trampoline.h
new file mode 100644
index 0000000000000..3322df8b1b340
--- /dev/null
+++ b/flang/include/flang/Runtime/trampoline.h
@@ -0,0 +1,69 @@
+//===-- include/flang/Runtime/trampoline.h ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Runtime support for W^X-compliant trampoline pool management.
+//
+// This provides an alternative to stack-based trampolines for internal
+// procedures with host association. Instead of requiring the stack to be
+// both writable and executable (violating W^X security policies), this
+// implementation uses a pool of pre-assembled trampolines in a separate
+// executable (but not writable) memory region, paired with writable (but
+// not executable) data entries.
+//
+// See flang/docs/InternalProcedureTrampolines.md for design details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_RUNTIME_TRAMPOLINE_H_
+#define FORTRAN_RUNTIME_TRAMPOLINE_H_
+
+#include "flang/Runtime/entry-names.h"
+
+namespace Fortran::runtime {
+extern "C" {
+
+/// Initializes a new trampoline and returns its internal handle.
+///
+/// Allocates a trampoline entry from the pool, configuring it to call
+/// \p calleeAddress with the static chain pointer \p staticChainAddress
+/// set in the appropriate register (per target ABI).
+///
+/// \p scratch is reserved for future use (e.g., fallback to stack
+/// trampolines). Pass nullptr for pool-based allocation.
+///
+/// The returned handle must be passed to TrampolineFree() when the
+/// host procedure exits.
+///
+/// Pool capacity: The pool is fixed-size (default 1024 slots, configurable
+/// via FLANG_TRAMPOLINE_POOL_SIZE env var). If all slots are in use, the
+/// runtime issues a fatal error. Dynamic slab growth may be added later.
+///
+/// Architecture support: Currently x86-64 and AArch64. On unsupported
+/// architectures, calling this function issues a fatal diagnostic.
+void *RTDECL(TrampolineInit)(
+ void *scratch, const void *calleeAddress, const void *staticChainAddress);
+
+/// Returns the callable trampoline address for the given handle.
+///
+/// \p handle is a value returned by TrampolineInit().
+/// The result is a function pointer that can be called directly; it will
+/// set up the static chain register and jump to the original callee.
+void *RTDECL(TrampolineAdjust)(void *handle);
+
+/// Frees the trampoline entry associated with the given handle.
+///
+/// Must be called at every exit from the host procedure to return the
+/// trampoline slot to the pool. After this call, any function pointer
+/// previously obtained via TrampolineAdjust() for this handle becomes
+/// invalid.
+void RTDECL(TrampolineFree)(void *handle);
+
+} // extern "C"
+} // namespace Fortran::runtime
+
+#endif // FORTRAN_RUNTIME_TRAMPOLINE_H_
diff --git a/flang/include/flang/Tools/CrossToolHelpers.h b/flang/include/flang/Tools/CrossToolHelpers.h
index 44fb252d2b366..af96f655adec8 100644
--- a/flang/include/flang/Tools/CrossToolHelpers.h
+++ b/flang/include/flang/Tools/CrossToolHelpers.h
@@ -88,6 +88,7 @@ struct MLIRToLLVMPassPipelineConfig : public FlangEPCallBacks {
const Fortran::common::MathOptionsBase &mathOpts) {
OptLevel = level;
StackArrays = opts.StackArrays;
+ EnableSafeTrampoline = opts.EnableSafeTrampoline;
Underscoring = opts.Underscoring;
LoopVersioning = opts.LoopVersioning;
DebugInfo = opts.getDebugInfo();
@@ -114,6 +115,7 @@ struct MLIRToLLVMPassPipelineConfig : public FlangEPCallBacks {
llvm::OptimizationLevel OptLevel; ///< optimisation level
bool StackArrays = false; ///< convert memory allocations to alloca.
+ bool EnableSafeTrampoline{false}; ///< Use runtime trampoline pool (W^X).
bool Underscoring = true; ///< add underscores to function names.
bool LoopVersioning = false; ///< Run the version loop pass.
bool AliasAnalysis = false; ///< Add TBAA tags to generated LLVMIR.
diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp
index fc4975f9592eb..e5ad5589895dd 100644
--- a/flang/lib/Frontend/CompilerInvocation.cpp
+++ b/flang/lib/Frontend/CompilerInvocation.cpp
@@ -284,6 +284,10 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts,
clang::options::OPT_fno_stack_arrays, false))
opts.StackArrays = 1;
+ if (args.hasFlag(clang::options::OPT_fsafe_trampoline,
+ clang::options::OPT_fno_safe_trampoline, false))
+ opts.EnableSafeTrampoline = 1;
+
if (args.getLastArg(clang::options::OPT_floop_interchange))
opts.InterchangeLoops = 1;
diff --git a/flang/lib/Optimizer/Builder/CMakeLists.txt b/flang/lib/Optimizer/Builder/CMakeLists.txt
index d966c52b29c33..8e5b5deff47c1 100644
--- a/flang/lib/Optimizer/Builder/CMakeLists.txt
+++ b/flang/lib/Optimizer/Builder/CMakeLists.txt
@@ -34,6 +34,7 @@ add_flang_library(FIRBuilder
Runtime/Stop.cpp
Runtime/Support.cpp
Runtime/TemporaryStack.cpp
+ Runtime/Trampoline.cpp
Runtime/Transformational.cpp
TemporaryStorage.cpp
diff --git a/flang/lib/Optimizer/Builder/Runtime/Trampoline.cpp b/flang/lib/Optimizer/Builder/Runtime/Trampoline.cpp
new file mode 100644
index 0000000000000..977b0ca721400
--- /dev/null
+++ b/flang/lib/Optimizer/Builder/Runtime/Trampoline.cpp
@@ -0,0 +1,49 @@
+//===-- Trampoline.cpp - Runtime trampoline pool builder --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Builder/Runtime/Trampoline.h"
+#include "flang/Optimizer/Builder/FIRBuilder.h"
+#include "flang/Optimizer/Builder/Runtime/RTBuilder.h"
+#include "flang/Runtime/trampoline.h"
+
+using namespace Fortran::runtime;
+using namespace fir::runtime;
+
+mlir::Value fir::runtime::genTrampolineInit(fir::FirOpBuilder &builder,
+ mlir::Location loc,
+ mlir::Value scratch,
+ mlir::Value calleeAddress,
+ mlir::Value staticChainAddress) {
+ mlir::func::FuncOp func{
+ getRuntimeFunc<mkRTKey(TrampolineInit)>(loc, builder)};
+ mlir::FunctionType fTy{func.getFunctionType()};
+ llvm::SmallVector<mlir::Value> args{createArguments(
+ builder, loc, fTy, scratch, calleeAddress, staticChainAddress)};
+ return fir::CallOp::create(builder, loc, func, args).getResult(0);
+}
+
+mlir::Value fir::runtime::genTrampolineAdjust(fir::FirOpBuilder &builder,
+ mlir::Location loc,
+ mlir::Value handle) {
+ mlir::func::FuncOp func{
+ getRuntimeFunc<mkRTKey(TrampolineAdjust)>(loc, builder)};
+ mlir::FunctionType fTy{func.getFunctionType()};
+ llvm::SmallVector<mlir::Value> args{
+ createArguments(builder, loc, fTy, handle)};
+ return fir::CallOp::create(builder, loc, func, args).getResult(0);
+}
+
+void fir::runtime::genTrampolineFree(fir::FirOpBuilder &builder,
+ mlir::Location loc, mlir::Value handle) {
+ mlir::func::FuncOp func{
+ getRuntimeFunc<mkRTKey(TrampolineFree)>(loc, builder)};
+ mlir::FunctionType fTy{func.getFunctionType()};
+ llvm::SmallVector<mlir::Value> args{
+ createArguments(builder, loc, fTy, handle)};
+ fir::CallOp::create(builder, loc, func, args);
+}
diff --git a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
index 61d6d2ae6329a..16618284abaca 100644
--- a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
+++ b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
@@ -10,6 +10,7 @@
#include "flang/Optimizer/Builder/FIRBuilder.h"
#include "flang/Optimizer/Builder/LowLevelIntrinsics.h"
+#include "flang/Optimizer/Builder/Runtime/Trampoline.h"
#include "flang/Optimizer/Dialect/FIRDialect.h"
#include "flang/Optimizer/Dialect/FIROps.h"
#include "flang/Optimizer/Dialect/FIRType.h"
@@ -20,6 +21,7 @@
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
namespace fir {
#define GEN_PASS_DEF_BOXEDPROCEDUREPASS
@@ -31,12 +33,6 @@ namespace fir {
using namespace fir;
namespace {
-/// Options to the procedure pointer pass.
-struct BoxedProcedureOptions {
- // Lower the boxproc abstraction to function pointers and thunks where
- // required.
- bool useThunks = true;
-};
/// This type converter rewrites all `!fir.boxproc<Func>` types to `Func` types.
class BoxprocTypeRewriter : public mlir::TypeConverter {
@@ -219,200 +215,363 @@ class BoxedProcedurePass
inline mlir::ModuleOp getModule() { return getOperation(); }
void runOnOperation() override final {
- if (options.useThunks) {
+ if (useThunks) {
auto *context = &getContext();
mlir::IRRewriter rewriter(context);
BoxprocTypeRewriter typeConverter(mlir::UnknownLoc::get(context));
- getModule().walk([&](mlir::Operation *op) {
- bool opIsValid = true;
- typeConverter.setLocation(op->getLoc());
- if (auto addr = mlir::dyn_cast<BoxAddrOp>(op)) {
- mlir::Type ty = addr.getVal().getType();
- mlir::Type resTy = addr.getResult().getType();
- if (llvm::isa<mlir::FunctionType>(ty) ||
- llvm::isa<fir::BoxProcType>(ty)) {
- // Rewrite all `fir.box_addr` ops on values of type `!fir.boxproc`
- // or function type to be `fir.convert` ops.
- rewriter.setInsertionPoint(addr);
- rewriter.replaceOpWithNewOp<ConvertOp>(
- addr, typeConverter.convertType(addr.getType()), addr.getVal());
- opIsValid = false;
- } else if (typeConverter.needsConversion(resTy)) {
- rewriter.startOpModification(op);
- op->getResult(0).setType(typeConverter.convertType(resTy));
- rewriter.finalizeOpModification(op);
- }
- } else if (auto func = mlir::dyn_cast<mlir::func::FuncOp>(op)) {
- mlir::FunctionType ty = func.getFunctionType();
- if (typeConverter.needsConversion(ty)) {
- rewriter.startOpModification(func);
- auto toTy =
- mlir::cast<mlir::FunctionType>(typeConverter.convertType(ty));
- if (!func.empty())
- for (auto e : llvm::enumerate(toTy.getInputs())) {
- unsigned i = e.index();
- auto &block = func.front();
- block.insertArgument(i, e.value(), func.getLoc());
- block.getArgument(i + 1).replaceAllUsesWith(
- block.getArgument(i));
- block.eraseArgument(i + 1);
- }
- func.setType(toTy);
- rewriter.finalizeOpModification(func);
+
+ // When using safe trampolines, we need to track handles per
+ // function so we can insert FreeTrampoline calls at each return.
+ // Process functions individually to manage this state.
+ if (useSafeTrampoline) {
+ getModule().walk([&](mlir::func::FuncOp funcOp) {
+ trampolineHandles.clear();
+ trampolineCallableMap.clear();
+ processFunction(funcOp, rewriter, typeConverter);
+ insertTrampolineFrees(funcOp, rewriter);
+ });
+ // Also process non-function ops at module level (globals, etc.)
+ processModuleLevelOps(rewriter, typeConverter);
+ } else {
+ getModule().walk([&](mlir::Operation *op) {
+ processOp(op, rewriter, typeConverter);
+ });
+ }
+ }
+ }
+
+private:
+ /// Trampoline handles collected while processing a function.
+ /// Each entry is a Value representing the opaque handle returned
+ /// by _FortranATrampolineInit, which must be freed before the
+ /// function returns.
+ llvm::SmallVector<mlir::Value> trampolineHandles;
+
+ /// Cache of trampoline callable addresses keyed by the func SSA value
+ /// of the emboxproc. This deduplicates trampolines when the same
+ /// internal procedure is emboxed multiple times in one host function.
+ llvm::DenseMap<mlir::Value, mlir::Value> trampolineCallableMap;
+
+ /// Process all ops within a function.
+ void processFunction(mlir::func::FuncOp funcOp, mlir::IRRewriter &rewriter,
+ BoxprocTypeRewriter &typeConverter) {
+ funcOp.walk(
+ [&](mlir::Operation *op) { processOp(op, rewriter, typeConverter); });
+ }
+
+ /// Process non-function ops at module level (globals, etc.)
+ void processModuleLevelOps(mlir::IRRewriter &rewriter,
+ BoxprocTypeRewriter &typeConverter) {
+ for (auto &op : getModule().getBody()->getOperations())
+ if (!mlir::isa<mlir::func::FuncOp>(op))
+ processOp(&op, rewriter, typeConverter);
+ }
+
+ /// Insert _FortranATrampolineFree calls before every return in the function.
+ void insertTrampolineFrees(mlir::func::FuncOp funcOp,
+ mlir::IRRewriter &rewriter) {
+ if (trampolineHandles.empty())
+ return;
+
+ auto module{funcOp->getParentOfType<mlir::ModuleOp>()};
+ // Insert TrampolineFree calls before every func.return in this function.
+ // At this pass stage (after CFGConversion), func.return is the only
+ // terminator that exits the function. Other terminators are either
+ // intra-function branches (cf.br, cf.cond_br, fir.select*) or
+ // fir.unreachable (after STOP/ERROR STOP), which don't need cleanup
+ // since the process is terminating.
+ funcOp.walk([&](mlir::func::ReturnOp retOp) {
+ rewriter.setInsertionPoint(retOp);
+ FirOpBuilder builder(rewriter, module);
+ auto loc{retOp.getLoc()};
+ for (mlir::Value handle : trampolineHandles)
+ fir::runtime::genTrampolineFree(builder, loc, handle);
+ });
+ }
+
+ /// Process a single operation for boxproc type rewriting.
+ void processOp(mlir::Operation *op, mlir::IRRewriter &rewriter,
+ BoxprocTypeRewriter &typeConverter) {
+ bool opIsValid{true};
+ typeConverter.setLocation(op->getLoc());
+ if (auto addr = mlir::dyn_cast<BoxAddrOp>(op)) {
+ mlir::Type ty{addr.getVal().getType()};
+ mlir::Type resTy{addr.getResult().getType()};
+ if (llvm::isa<mlir::FunctionType>(ty) ||
+ llvm::isa<fir::BoxProcType>(ty)) {
+ // Rewrite all `fir.box_addr` ops on values of type `!fir.boxproc`
+ // or function type to be `fir.convert` ops.
+ rewriter.setInsertionPoint(addr);
+ rewriter.replaceOpWithNewOp<ConvertOp>(
+ addr, typeConverter.convertType(addr.getType()), addr.getVal());
+ opIsValid = false;
+ } else if (typeConverter.needsConversion(resTy)) {
+ rewriter.startOpModification(op);
+ op->getResult(0).setType(typeConverter.convertType(resTy));
+ rewriter.finalizeOpModification(op);
+ }
+ } else if (auto func = mlir::dyn_cast<mlir::func::FuncOp>(op)) {
+ mlir::FunctionType ty{func.getFunctionType()};
+ if (typeConverter.needsConversion(ty)) {
+ rewriter.startOpModification(func);
+ auto toTy{
+ mlir::cast<mlir::FunctionType>(typeConverter.convertType(ty))};
+ if (!func.empty())
+ for (auto e : llvm::enumerate(toTy.getInputs())) {
+ auto i{static_cast<unsigned>(e.index())};
+ auto &block{func.front()};
+ block.insertArgument(i, e.value(), func.getLoc());
+ block.getArgument(i + 1).replaceAllUsesWith(block.getArgument(i));
+ block.eraseArgument(i + 1);
}
- } else if (auto embox = mlir::dyn_cast<EmboxProcOp>(op)) {
- // Rewrite all `fir.emboxproc` ops to either `fir.convert` or a thunk
- // as required.
- mlir::Type toTy = typeConverter.convertType(
- mlir::cast<BoxProcType>(embox.getType()).getEleTy());
- rewriter.setInsertionPoint(embox);
- if (embox.getHost()) {
- // Create the thunk.
- auto module = embox->getParentOfType<mlir::ModuleOp>();
- FirOpBuilder builder(rewriter, module);
- const auto triple{fir::getTargetTriple(module)};
- auto loc = embox.getLoc();
- mlir::Type i8Ty = builder.getI8Type();
- mlir::Type i8Ptr = builder.getRefType(i8Ty);
- // For PPC32 and PPC64, the thunk is populated by a call to
- // __trampoline_setup, which is defined in
- // compiler-rt/lib/builtins/trampoline_setup.c and requires the
- // thunk size greater than 32 bytes. For AArch64, RISCV and x86_64,
- // the thunk setup doesn't go through __trampoline_setup and fits in
- // 32 bytes.
- fir::SequenceType::Extent thunkSize = triple.getTrampolineSize();
- mlir::Type buffTy = SequenceType::get({thunkSize}, i8Ty);
- auto buffer = AllocaOp::create(builder, loc, buffTy);
- mlir::Value closure =
- builder.createConvert(loc, i8Ptr, embox.getHost());
- mlir::Value tramp = builder.createConvert(loc, i8Ptr, buffer);
- mlir::Value func =
- builder.createConvert(loc, i8Ptr, embox.getFunc());
- fir::CallOp::create(
- builder, loc, factory::getLlvmInitTrampoline(builder),
- llvm::ArrayRef<mlir::Value>{tramp, func, closure});
- auto adjustCall = fir::CallOp::create(
- builder, loc, factory::getLlvmAdjustTrampoline(builder),
- llvm::ArrayRef<mlir::Value>{tramp});
+ func.setType(toTy);
+ rewriter.finalizeOpModification(func);
+ }
+ } else if (auto embox = mlir::dyn_cast<EmboxProcOp>(op)) {
+ // Rewrite all `fir.emboxproc` ops to either `fir.convert` or a thunk
+ // as required.
+ mlir::Type toTy{typeConverter.convertType(
+ mlir::cast<BoxProcType>(embox.getType()).getEleTy())};
+ rewriter.setInsertionPoint(embox);
+ if (embox.getHost()) {
+ auto module{embox->getParentOfType<mlir::ModuleOp>()};
+ auto loc{embox.getLoc()};
+
+ if (useSafeTrampoline) {
+ // Runtime trampoline pool path (W^X compliant).
+ // Insert Init/Adjust in the function's entry block so the
+ // handle dominates all func.return ops where TrampolineFree
+ // is emitted. This is necessary because fir.emboxproc may
+ // appear inside control flow branches. A cache avoids
+ // creating duplicate trampolines for the same internal
+ // procedure within a single host function.
+ mlir::Value funcVal{embox.getFunc()};
+ auto cacheIt{trampolineCallableMap.find(funcVal)};
+ if (cacheIt != trampolineCallableMap.end()) {
rewriter.replaceOpWithNewOp<ConvertOp>(embox, toTy,
- adjustCall.getResult(0));
- opIsValid = false;
+ cacheIt->second);
} else {
- // Just forward the function as a pointer.
- rewriter.replaceOpWithNewOp<ConvertOp>(embox, toTy,
- embox.getFunc());
- opIsValid = false;
- }
- } else if (auto global = mlir::dyn_cast<GlobalOp>(op)) {
- auto ty = global.getType();
- if (typeConverter.needsConversion(ty)) {
- rewriter.startOpModification(global);
- auto toTy = typeConverter.convertType(ty);
- global.setType(toTy);
- rewriter.finalizeOpModification(global);
- }
- } else if (auto mem = mlir::dyn_cast<AllocaOp>(op)) {
- auto ty = mem.getType();
- if (typeConverter.needsConversion(ty)) {
- rewriter.setInsertionPoint(mem);
- auto toTy = typeConverter.convertType(unwrapRefType(ty));
- bool isPinned = mem.getPinned();
- llvm::StringRef uniqName =
- mem.getUniqName().value_or(llvm::StringRef());
- llvm::StringRef bindcName =
- mem.getBindcName().value_or(llvm::StringRef());
- rewriter.replaceOpWithNewOp<AllocaOp>(
- mem, toTy, uniqName, bindcName, isPinned, mem.getTypeparams(),
- mem.getShape());
- opIsValid = false;
- }
- } else if (auto mem = mlir::dyn_cast<AllocMemOp>(op)) {
- auto ty = mem.getType();
- if (typeConverter.needsConversion(ty)) {
- rewriter.setInsertionPoint(mem);
- auto toTy = typeConverter.convertType(unwrapRefType(ty));
- llvm::StringRef uniqName =
- mem.getUniqName().value_or(llvm::StringRef());
- llvm::StringRef bindcName =
- mem.getBindcName().value_or(llvm::StringRef());
- rewriter.replaceOpWithNewOp<AllocMemOp>(
- mem, toTy, uniqName, bindcName, mem.getTypeparams(),
- mem.getShape());
- opIsValid = false;
- }
- } else if (auto coor = mlir::dyn_cast<CoordinateOp>(op)) {
- auto ty = coor.getType();
- mlir::Type baseTy = coor.getBaseType();
- if (typeConverter.needsConversion(ty) ||
- typeConverter.needsConversion(baseTy)) {
- rewriter.setInsertionPoint(coor);
- auto toTy = typeConverter.convertType(ty);
- auto toBaseTy = typeConverter.convertType(baseTy);
- rewriter.replaceOpWithNewOp<CoordinateOp>(
- coor, toTy, coor.getRef(), coor.getCoor(), toBaseTy,
- coor.getFieldIndicesAttr());
- opIsValid = false;
- }
- } else if (auto index = mlir::dyn_cast<FieldIndexOp>(op)) {
- auto ty = index.getType();
- mlir::Type onTy = index.getOnType();
- if (typeConverter.needsConversion(ty) ||
- typeConverter.needsConversion(onTy)) {
- rewriter.setInsertionPoint(index);
- auto toTy = typeConverter.convertType(ty);
- auto toOnTy = typeConverter.convertType(onTy);
- rewriter.replaceOpWithNewOp<FieldIndexOp>(
- index, toTy, index.getFieldId(), toOnTy, index.getTypeparams());
- opIsValid = false;
- }
- } else if (auto index = mlir::dyn_cast<LenParamIndexOp>(op)) {
- auto ty = index.getType();
- mlir::Type onTy = index.getOnType();
- if (typeConverter.needsConversion(ty) ||
- typeConverter.needsConversion(onTy)) {
- rewriter.setInsertionPoint(index);
- auto toTy = typeConverter.convertType(ty);
- auto toOnTy = typeConverter.convertType(onTy);
- rewriter.replaceOpWithNewOp<LenParamIndexOp>(
- index, toTy, index.getFieldId(), toOnTy, index.getTypeparams());
- opIsValid = false;
- }
- } else {
- rewriter.startOpModification(op);
- // Convert the operands if needed
- for (auto i : llvm::enumerate(op->getResultTypes()))
- if (typeConverter.needsConversion(i.value())) {
- auto toTy = typeConverter.convertType(i.value());
- op->getResult(i.index()).setType(toTy);
+ auto parentFunc{embox->getParentOfType<mlir::func::FuncOp>()};
+ auto &entryBlock{parentFunc.front()};
+
+ auto savedIP{rewriter.saveInsertionPoint()};
+
+ // Find the right insertion point in the entry block.
+ // Walk up from the emboxproc to find its top-level
+ // ancestor in the entry block. For an emboxproc directly
+ // in the entry block, this is the emboxproc itself.
+ // For one inside a structured op (fir.if, fir.do_loop),
+ // this is that structured op. For one inside an explicit
+ // branch target (cf.cond_br → ^bb1), we fall back to the
+ // entry block terminator.
+ mlir::Operation *entryAncestor{embox.getOperation()};
+ while (entryAncestor->getBlock() != &entryBlock) {
+ entryAncestor = entryAncestor->getParentOp();
+ if (!entryAncestor ||
+ mlir::isa<mlir::func::FuncOp>(entryAncestor))
+ break;
}
+ bool ancestorInEntry{
+ entryAncestor &&
+ !mlir::isa<mlir::func::FuncOp>(entryAncestor) &&
+ entryAncestor->getBlock() == &entryBlock};
- // Convert the type attributes if needed
- for (const mlir::NamedAttribute &attr : op->getAttrDictionary())
- if (auto tyAttr = llvm::dyn_cast<mlir::TypeAttr>(attr.getValue()))
- if (typeConverter.needsConversion(tyAttr.getValue())) {
- auto toTy = typeConverter.convertType(tyAttr.getValue());
- op->setAttr(attr.getName(), mlir::TypeAttr::get(toTy));
+ // If the func value is not in the entry block (e.g.,
+ // address_of generated inside a structured fir.if),
+ // clone it into the entry block.
+ mlir::Value funcValInEntry{funcVal};
+ if (auto *funcDef{funcVal.getDefiningOp()}) {
+ if (funcDef->getBlock() != &entryBlock) {
+ if (ancestorInEntry)
+ rewriter.setInsertionPoint(entryAncestor);
+ else
+ rewriter.setInsertionPoint(entryBlock.getTerminator());
+ auto *cloned{rewriter.clone(*funcDef)};
+ funcValInEntry = cloned->getResult(0);
}
- rewriter.finalizeOpModification(op);
+ }
+
+ // The host link (closure pointer) must already be in the entry
+ // block. In practice it is always either a function block argument
+ // or an alloca emitted at function entry by the lowering — cloning
+ // just the defining op would miss any stores that initialise it,
+ // producing incorrect code. Emit an error when that invariant is
+ // violated rather than attempting a broken clone.
+ mlir::Value hostValInEntry{embox.getHost()};
+ if (auto *hostDef{embox.getHost().getDefiningOp()}) {
+ if (hostDef->getBlock() != &entryBlock) {
+ mlir::emitError(loc,
+ "host link value is not defined in the entry "
+ "block of the host function; cannot hoist "
+ "TrampolineInit safely");
+ return;
+ }
+ }
+
+ // Insert Init/Adjust at the determined position.
+ FirOpBuilder builder(rewriter, module);
+ if (ancestorInEntry)
+ builder.setInsertionPoint(entryAncestor);
+ else
+ builder.setInsertionPoint(entryBlock.getTerminator());
+ mlir::Type i8Ty{builder.getI8Type()};
+ mlir::Type i8Ptr{builder.getRefType(i8Ty)};
+
+ mlir::Value nullPtr{builder.createNullConstant(loc, i8Ptr)};
+ mlir::Value closure{
+ builder.createConvert(loc, i8Ptr, hostValInEntry)};
+ mlir::Value func{builder.createConvert(loc, i8Ptr, funcValInEntry)};
+
+ // _FortranATrampolineInit(nullptr, func, closure) -> handle
+ mlir::Value handle{fir::runtime::genTrampolineInit(
+ builder, loc, nullPtr, func, closure)};
+
+ // _FortranATrampolineAdjust(handle) -> callable address
+ mlir::Value callableAddr{
+ fir::runtime::genTrampolineAdjust(builder, loc, handle)};
+
+ trampolineHandles.push_back(handle);
+ trampolineCallableMap[funcVal] = callableAddr;
+
+ rewriter.restoreInsertionPoint(savedIP);
+ rewriter.replaceOpWithNewOp<ConvertOp>(embox, toTy, callableAddr);
+ }
+ } else {
+ // Legacy stack-based trampoline path.
+ FirOpBuilder builder(rewriter, module);
+ mlir::Type i8Ty{builder.getI8Type()};
+ mlir::Type i8Ptr{builder.getRefType(i8Ty)};
+ const auto triple{fir::getTargetTriple(module)};
+ // For PPC32 and PPC64, the thunk is populated by a call to
+ // __trampoline_setup, which is defined in
+ // compiler-rt/lib/builtins/trampoline_setup.c and requires the
+ // thunk size greater than 32 bytes. For AArch64, RISCV and
+ // x86_64, the thunk setup doesn't go through
+ // __trampoline_setup and fits in 32 bytes.
+ fir::SequenceType::Extent thunkSize{triple.getTrampolineSize()};
+ mlir::Type buffTy{SequenceType::get({thunkSize}, i8Ty)};
+ auto buffer{AllocaOp::create(builder, loc, buffTy)};
+ mlir::Value closure{
+ builder.createConvert(loc, i8Ptr, embox.getHost())};
+ mlir::Value tramp{builder.createConvert(loc, i8Ptr, buffer)};
+ mlir::Value func{builder.createConvert(loc, i8Ptr, embox.getFunc())};
+ fir::CallOp::create(
+ builder, loc, factory::getLlvmInitTrampoline(builder),
+ llvm::ArrayRef<mlir::Value>{tramp, func, closure});
+ auto adjustCall{fir::CallOp::create(
+ builder, loc, factory::getLlvmAdjustTrampoline(builder),
+ llvm::ArrayRef<mlir::Value>{tramp})};
+ rewriter.replaceOpWithNewOp<ConvertOp>(embox, toTy,
+ adjustCall.getResult(0));
}
- // Ensure block arguments are updated if needed.
- if (opIsValid && op->getNumRegions() != 0) {
- rewriter.startOpModification(op);
- for (mlir::Region &region : op->getRegions())
- for (mlir::Block &block : region.getBlocks())
- for (mlir::BlockArgument blockArg : block.getArguments())
- if (typeConverter.needsConversion(blockArg.getType())) {
- mlir::Type toTy =
- typeConverter.convertType(blockArg.getType());
- blockArg.setType(toTy);
- }
- rewriter.finalizeOpModification(op);
+ opIsValid = false;
+ } else {
+ // Just forward the function as a pointer.
+ rewriter.replaceOpWithNewOp<ConvertOp>(embox, toTy, embox.getFunc());
+ opIsValid = false;
+ }
+ } else if (auto global = mlir::dyn_cast<GlobalOp>(op)) {
+ auto ty{global.getType()};
+ if (typeConverter.needsConversion(ty)) {
+ rewriter.startOpModification(global);
+ auto toTy{typeConverter.convertType(ty)};
+ global.setType(toTy);
+ rewriter.finalizeOpModification(global);
+ }
+ } else if (auto mem = mlir::dyn_cast<AllocaOp>(op)) {
+ auto ty{mem.getType()};
+ if (typeConverter.needsConversion(ty)) {
+ rewriter.setInsertionPoint(mem);
+ auto toTy{typeConverter.convertType(unwrapRefType(ty))};
+ bool isPinned{mem.getPinned()};
+ llvm::StringRef uniqName{mem.getUniqName().value_or(llvm::StringRef())};
+ llvm::StringRef bindcName{
+ mem.getBindcName().value_or(llvm::StringRef())};
+ rewriter.replaceOpWithNewOp<AllocaOp>(mem, toTy, uniqName, bindcName,
+ isPinned, mem.getTypeparams(),
+ mem.getShape());
+ opIsValid = false;
+ }
+ } else if (auto mem = mlir::dyn_cast<AllocMemOp>(op)) {
+ auto ty{mem.getType()};
+ if (typeConverter.needsConversion(ty)) {
+ rewriter.setInsertionPoint(mem);
+ auto toTy{typeConverter.convertType(unwrapRefType(ty))};
+ llvm::StringRef uniqName{mem.getUniqName().value_or(llvm::StringRef())};
+ llvm::StringRef bindcName{
+ mem.getBindcName().value_or(llvm::StringRef())};
+ rewriter.replaceOpWithNewOp<AllocMemOp>(mem, toTy, uniqName, bindcName,
+ mem.getTypeparams(),
+ mem.getShape());
+ opIsValid = false;
+ }
+ } else if (auto coor = mlir::dyn_cast<CoordinateOp>(op)) {
+ auto ty{coor.getType()};
+ mlir::Type baseTy{coor.getBaseType()};
+ if (typeConverter.needsConversion(ty) ||
+ typeConverter.needsConversion(baseTy)) {
+ rewriter.setInsertionPoint(coor);
+ auto toTy{typeConverter.convertType(ty)};
+ auto toBaseTy{typeConverter.convertType(baseTy)};
+ rewriter.replaceOpWithNewOp<CoordinateOp>(coor, toTy, coor.getRef(),
+ coor.getCoor(), toBaseTy,
+ coor.getFieldIndicesAttr());
+ opIsValid = false;
+ }
+ } else if (auto index = mlir::dyn_cast<FieldIndexOp>(op)) {
+ auto ty{index.getType()};
+ mlir::Type onTy{index.getOnType()};
+ if (typeConverter.needsConversion(ty) ||
+ typeConverter.needsConversion(onTy)) {
+ rewriter.setInsertionPoint(index);
+ auto toTy{typeConverter.convertType(ty)};
+ auto toOnTy{typeConverter.convertType(onTy)};
+ rewriter.replaceOpWithNewOp<FieldIndexOp>(
+ index, toTy, index.getFieldId(), toOnTy, index.getTypeparams());
+ opIsValid = false;
+ }
+ } else if (auto index = mlir::dyn_cast<LenParamIndexOp>(op)) {
+ auto ty{index.getType()};
+ mlir::Type onTy{index.getOnType()};
+ if (typeConverter.needsConversion(ty) ||
+ typeConverter.needsConversion(onTy)) {
+ rewriter.setInsertionPoint(index);
+ auto toTy{typeConverter.convertType(ty)};
+ auto toOnTy{typeConverter.convertType(onTy)};
+ rewriter.replaceOpWithNewOp<LenParamIndexOp>(
+ index, toTy, index.getFieldId(), toOnTy, index.getTypeparams());
+ opIsValid = false;
+ }
+ } else {
+ rewriter.startOpModification(op);
+ // Convert the operands if needed
+ for (auto i : llvm::enumerate(op->getResultTypes()))
+ if (typeConverter.needsConversion(i.value())) {
+ auto toTy{typeConverter.convertType(i.value())};
+ op->getResult(i.index()).setType(toTy);
}
- });
+
+ // Convert the type attributes if needed
+ for (const mlir::NamedAttribute &attr : op->getAttrDictionary())
+ if (auto tyAttr = llvm::dyn_cast<mlir::TypeAttr>(attr.getValue()))
+ if (typeConverter.needsConversion(tyAttr.getValue())) {
+ auto toTy{typeConverter.convertType(tyAttr.getValue())};
+ op->setAttr(attr.getName(), mlir::TypeAttr::get(toTy));
+ }
+ rewriter.finalizeOpModification(op);
+ }
+ // Ensure block arguments are updated if needed.
+ if (opIsValid && op->getNumRegions() != 0) {
+ rewriter.startOpModification(op);
+ for (mlir::Region &region : op->getRegions())
+ for (mlir::Block &block : region.getBlocks())
+ for (mlir::BlockArgument blockArg : block.getArguments())
+ if (typeConverter.needsConversion(blockArg.getType())) {
+ mlir::Type toTy{typeConverter.convertType(blockArg.getType())};
+ blockArg.setType(toTy);
+ }
+ rewriter.finalizeOpModification(op);
}
}
-
-private:
- BoxedProcedureOptions options;
};
} // namespace
diff --git a/flang/lib/Optimizer/Passes/CommandLineOpts.cpp b/flang/lib/Optimizer/Passes/CommandLineOpts.cpp
index 75e818d330b51..d461c1b9757b5 100644
--- a/flang/lib/Optimizer/Passes/CommandLineOpts.cpp
+++ b/flang/lib/Optimizer/Passes/CommandLineOpts.cpp
@@ -71,6 +71,8 @@ DisableOption(FirToLlvmIr, "fir-to-llvmir", "FIR to LLVM-IR dialect");
DisableOption(LlvmIrToLlvm, "llvm", "conversion to LLVM");
DisableOption(BoxedProcedureRewrite, "boxed-procedure-rewrite",
"rewrite boxed procedures");
+EnableOption(SafeTrampoline, "safe-trampoline",
+ "W^X compliant runtime trampoline pool");
DisableOption(ExternalNameConversion, "external-name-interop",
"convert names with external convention");
diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp
index 5927fff960270..4597f93047d72 100644
--- a/flang/lib/Optimizer/Passes/Pipelines.cpp
+++ b/flang/lib/Optimizer/Passes/Pipelines.cpp
@@ -140,9 +140,16 @@ void addLLVMDialectToLLVMPass(mlir::PassManager &pm,
});
}
-void addBoxedProcedurePass(mlir::PassManager &pm) {
- addPassConditionally(pm, disableBoxedProcedureRewrite,
- [&]() { return fir::createBoxedProcedurePass(); });
+void addBoxedProcedurePass(mlir::PassManager &pm,
+ bool enableSafeTrampolineFromConfig) {
+ addPassConditionally(pm, disableBoxedProcedureRewrite, [&]() {
+ fir::BoxedProcedurePassOptions opts;
+ // Support both the frontend -fsafe-trampoline flag (via config)
+ // and the cl::opt --safe-trampoline (for fir-opt/tco tools).
+ opts.useSafeTrampoline =
+ enableSafeTrampolineFromConfig || enableSafeTrampoline;
+ return fir::createBoxedProcedurePass(opts);
+ });
}
void addExternalNameConversionPass(mlir::PassManager &pm,
@@ -371,7 +378,7 @@ void createDefaultFIRCodeGenPassPipeline(mlir::PassManager &pm,
MLIRToLLVMPassPipelineConfig config,
llvm::StringRef inputFilename) {
pm.addPass(fir::createMIFOpConversion());
- fir::addBoxedProcedurePass(pm);
+ fir::addBoxedProcedurePass(pm, config.EnableSafeTrampoline);
if (config.OptLevel.isOptimizingForSpeed() && config.AliasAnalysis &&
!disableFirAliasTags && !useOldAliasTags)
pm.addPass(fir::createAddAliasTags());
diff --git a/flang/test/Driver/fsafe-trampoline.f90 b/flang/test/Driver/fsafe-trampoline.f90
new file mode 100644
index 0000000000000..87243e0d8127a
--- /dev/null
+++ b/flang/test/Driver/fsafe-trampoline.f90
@@ -0,0 +1,13 @@
+! Test that -fsafe-trampoline is properly forwarded from the driver to the
+! frontend, and that with -fno-safe-trampoline (the default) it is not forwarded.
+
+! RUN: %flang -### -fsafe-trampoline %s 2>&1 | FileCheck %s --check-prefix=ON
+! RUN: %flang -### -fno-safe-trampoline %s 2>&1 | FileCheck %s --check-prefix=OFF
+! RUN: %flang -### %s 2>&1 | FileCheck %s --check-prefix=OFF
+! RUN: %flang -### -fsafe-trampoline -fno-safe-trampoline %s 2>&1 | FileCheck %s --check-prefix=OFF
+
+! ON: "-fsafe-trampoline"
+! OFF-NOT: "-fsafe-trampoline"
+
+program dummy
+end program
diff --git a/flang/test/Fir/boxproc-safe-trampoline.fir b/flang/test/Fir/boxproc-safe-trampoline.fir
new file mode 100644
index 0000000000000..de310bccaf19d
--- /dev/null
+++ b/flang/test/Fir/boxproc-safe-trampoline.fir
@@ -0,0 +1,99 @@
+// RUN: fir-opt --boxed-procedure="use-safe-trampoline=true" %s | FileCheck %s
+
+// Test that the --boxed-procedure pass with use-safe-trampoline=true
+// generates calls to _FortranATrampolineInit, _FortranATrampolineAdjust,
+// and _FortranATrampolineFree instead of llvm.init.trampoline and
+// llvm.adjust.trampoline intrinsics.
+
+// CHECK-LABEL: func.func @_QPtest_proc_dummy()
+// CHECK: fir.zero_bits !fir.ref<i8>
+// CHECK: fir.convert {{.*}} : {{.*}} -> !fir.ref<i8>
+// CHECK: fir.convert {{.*}} : {{.*}} -> !fir.ref<i8>
+// CHECK: fir.convert {{.*}} : {{.*}} -> !fir.llvm_ptr<i8>
+// CHECK: fir.convert {{.*}} : {{.*}} -> !fir.llvm_ptr<i8>
+// CHECK: fir.convert {{.*}} : {{.*}} -> !fir.llvm_ptr<i8>
+// CHECK: %[[HANDLE:.*]] = fir.call @_FortranATrampolineInit({{.*}}) : (!fir.llvm_ptr<i8>, !fir.llvm_ptr<i8>, !fir.llvm_ptr<i8>) -> !fir.llvm_ptr<i8>
+// CHECK: %[[ADDR:.*]] = fir.call @_FortranATrampolineAdjust(%[[HANDLE]]) : (!fir.llvm_ptr<i8>) -> !fir.llvm_ptr<i8>
+// CHECK: %[[FPTR:.*]] = fir.convert %[[ADDR]]
+// CHECK: fir.call @_QPtest_proc_dummy_other(%[[FPTR]])
+// CHECK: fir.call @_FortranATrampolineFree(%[[HANDLE]])
+// CHECK: return
+
+func.func @_QPtest_proc_dummy() {
+ %c0_i32 = arith.constant 0 : i32
+ %c1_i32 = arith.constant 1 : i32
+ %c-1_i32 = arith.constant -1 : i32
+ %c5_i32 = arith.constant 5 : i32
+ %0 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFtest_proc_dummyEi"}
+ %1 = fir.alloca tuple<!fir.ref<i32>>
+ %2 = fir.coordinate_of %1, %c0_i32 : (!fir.ref<tuple<!fir.ref<i32>>>, i32) -> !fir.llvm_ptr<!fir.ref<i32>>
+ fir.store %0 to %2 : !fir.llvm_ptr<!fir.ref<i32>>
+ fir.store %c1_i32 to %0 : !fir.ref<i32>
+ %3 = fir.address_of(@_QFtest_proc_dummyPtest_proc_dummy_a) : (!fir.ref<i32>, !fir.ref<tuple<!fir.ref<i32>>>) -> ()
+ %4 = fir.emboxproc %3, %1 : ((!fir.ref<i32>, !fir.ref<tuple<!fir.ref<i32>>>) -> (), !fir.ref<tuple<!fir.ref<i32>>>) -> !fir.boxproc<() -> ()>
+ fir.call @_QPtest_proc_dummy_other(%4) : (!fir.boxproc<() -> ()>) -> ()
+ %5 = fir.address_of(@_QQclX2E2F682E66393000) : !fir.ref<!fir.char<1,8>>
+ %6 = fir.convert %5 : (!fir.ref<!fir.char<1,8>>) -> !fir.ref<i8>
+ %7 = fir.call @_FortranAioBeginExternalListOutput(%c-1_i32, %6, %c5_i32) : (i32, !fir.ref<i8>, i32) -> !fir.ref<i8>
+ %8 = fir.load %0 : !fir.ref<i32>
+ %9 = fir.call @_FortranAioOutputInteger32(%7, %8) : (!fir.ref<i8>, i32) -> i1
+ %10 = fir.call @_FortranAioEndIoStatement(%7) : (!fir.ref<i8>) -> i32
+ return
+}
+func.func @_QFtest_proc_dummyPtest_proc_dummy_a(%arg0: !fir.ref<i32> {fir.bindc_name = "j"}, %arg1: !fir.ref<tuple<!fir.ref<i32>>> {fir.host_assoc}) {
+ %c0_i32 = arith.constant 0 : i32
+ %0 = fir.coordinate_of %arg1, %c0_i32 : (!fir.ref<tuple<!fir.ref<i32>>>, i32) -> !fir.llvm_ptr<!fir.ref<i32>>
+ %1 = fir.load %0 : !fir.llvm_ptr<!fir.ref<i32>>
+ %2 = fir.load %1 : !fir.ref<i32>
+ %3 = fir.load %arg0 : !fir.ref<i32>
+ %4 = arith.addi %2, %3 : i32
+ fir.store %4 to %1 : !fir.ref<i32>
+ return
+}
+func.func @_QPtest_proc_dummy_other(%arg0: !fir.boxproc<() -> ()>) {
+ %c4_i32 = arith.constant 4 : i32
+ %0 = fir.alloca i32 {adapt.valuebyref}
+ fir.store %c4_i32 to %0 : !fir.ref<i32>
+ %1 = fir.box_addr %arg0 : (!fir.boxproc<() -> ()>) -> ((!fir.ref<i32>) -> ())
+ fir.call %1(%0) : (!fir.ref<i32>) -> ()
+ return
+}
+fir.global linkonce @_QQclX2E2F682E66393000 constant : !fir.char<1,8> {
+ %0 = fir.string_lit "./h.f90\00"(8) : !fir.char<1,8>
+ fir.has_value %0 : !fir.char<1,8>
+}
+func.func private @_FortranAioOutputInteger32(!fir.ref<i8>, i32) -> i1 attributes {fir.io, fir.runtime}
+func.func private @_FortranAioBeginExternalListOutput(i32, !fir.ref<i8>, i32) -> !fir.ref<i8> attributes {fir.io, fir.runtime}
+func.func private @_FortranAioEndIoStatement(!fir.ref<i8>) -> i32 attributes {fir.io, fir.runtime}
+
+// Test that Init/Adjust are hoisted to the entry block when emboxproc
+// is inside a control flow branch, ensuring the handle dominates
+// the TrampolineFree at func.return.
+// CHECK-LABEL: func.func @_QPtest_branch
+// Init and Adjust should appear before the branch (hoisted to entry block).
+// CHECK: fir.call @_FortranATrampolineInit
+// CHECK: fir.call @_FortranATrampolineAdjust
+// CHECK: cf.cond_br
+// Free should appear before return.
+// CHECK: fir.call @_FortranATrampolineFree
+// CHECK: return
+
+func.func @_QPtest_branch(%arg0: i1) {
+ %c0_i32 = arith.constant 0 : i32
+ %0 = fir.alloca tuple<!fir.ref<i32>>
+ %1 = fir.alloca i32 {bindc_name = "x"}
+ %2 = fir.coordinate_of %0, %c0_i32 : (!fir.ref<tuple<!fir.ref<i32>>>, i32) -> !fir.llvm_ptr<!fir.ref<i32>>
+ fir.store %1 to %2 : !fir.llvm_ptr<!fir.ref<i32>>
+ %3 = fir.address_of(@_QPtest_branchPinternal) : (!fir.ref<tuple<!fir.ref<i32>>>) -> ()
+ cf.cond_br %arg0, ^bb1, ^bb2
+^bb1:
+ %4 = fir.emboxproc %3, %0 : ((!fir.ref<tuple<!fir.ref<i32>>>) -> (), !fir.ref<tuple<!fir.ref<i32>>>) -> !fir.boxproc<() -> ()>
+ fir.call @_QPdo_something(%4) : (!fir.boxproc<() -> ()>) -> ()
+ cf.br ^bb2
+^bb2:
+ return
+}
+func.func @_QPtest_branchPinternal(%arg0: !fir.ref<tuple<!fir.ref<i32>>> {fir.host_assoc}) {
+ return
+}
+func.func private @_QPdo_something(!fir.boxproc<() -> ()>)
diff --git a/flang/test/Lower/safe-trampoline.f90 b/flang/test/Lower/safe-trampoline.f90
new file mode 100644
index 0000000000000..ff28df107af82
--- /dev/null
+++ b/flang/test/Lower/safe-trampoline.f90
@@ -0,0 +1,126 @@
+! RUN: %flang_fc1 -fsafe-trampoline -emit-llvm -o - %s | FileCheck %s
+!
+! Test that -fsafe-trampoline generates calls to the runtime
+! trampoline pool instead of stack-based trampolines.
+!
+! Test cases cover trampolines at the top-level block, used inside an IF block,
+! used inside a DO loop, and an emboxproc generated inside a branch.
+
+! CHECK-LABEL: define {{.*}}@host_
+! CHECK: call {{.*}}@_FortranATrampolineInit
+! CHECK: call {{.*}}@_FortranATrampolineAdjust
+! CHECK: call {{.*}}@_FortranATrampolineFree
+
+! CHECK-LABEL: define {{.*}}@host_in_if_
+! CHECK: call {{.*}}@_FortranATrampolineInit
+! CHECK: call {{.*}}@_FortranATrampolineAdjust
+! CHECK: call {{.*}}@_FortranATrampolineFree
+
+! CHECK-LABEL: define {{.*}}@host_in_do_loop_
+! CHECK: call {{.*}}@_FortranATrampolineInit
+! CHECK: call {{.*}}@_FortranATrampolineAdjust
+! CHECK: call {{.*}}@_FortranATrampolineFree
+
+! CHECK-LABEL: define {{.*}}@host_branch_
+! CHECK: call {{.*}}@_FortranATrampolineInit
+! CHECK: call {{.*}}@_FortranATrampolineAdjust
+! CHECK: call {{.*}}@_FortranATrampolineFree
+
+module other
+ abstract interface
+ function callback()
+ integer :: callback
+ end function callback
+ end interface
+ contains
+ subroutine foo(fptr)
+ procedure(callback), pointer :: fptr
+ print *, fptr()
+ end subroutine foo
+ subroutine bar(fproc)
+ procedure(callback) :: fproc
+ print *, fproc()
+ end subroutine bar
+end module other
+
+! Test 1: trampoline at top-level block (baseline).
+subroutine host(local)
+ use other
+ integer :: local
+ procedure(callback), pointer :: fptr
+ fptr => callee
+ call foo(fptr)
+ return
+
+ contains
+
+ function callee()
+ integer :: callee
+ callee = local
+ end function callee
+end subroutine host
+
+! Test 2: trampoline used inside an IF block.
+subroutine host_in_if(local, flag)
+ use other
+ integer :: local
+ logical :: flag
+ procedure(callback), pointer :: fptr
+ fptr => callee
+ if (flag) then
+ call foo(fptr)
+ end if
+ return
+
+ contains
+
+ function callee()
+ integer :: callee
+ callee = local
+ end function callee
+end subroutine host_in_if
+
+! Test 3: trampoline used inside a DO loop.
+subroutine host_in_do_loop(local, n)
+ use other
+ integer :: local
+ integer :: n
+ integer :: i
+ procedure(callback), pointer :: fptr
+ fptr => callee
+ do i = 1, n
+ call foo(fptr)
+ end do
+ return
+
+ contains
+
+ function callee()
+ integer :: callee
+ callee = local + i
+ end function callee
+end subroutine host_in_do_loop
+
+! Test 4: emboxproc generated inside a branch (internal procedure passed
+! directly as actual argument inside an IF block).
+subroutine host_branch(local, flag)
+ use other
+ integer :: local
+ logical :: flag
+ if (flag) call bar(callee)
+ return
+
+ contains
+
+ function callee()
+ integer :: callee
+ callee = local
+ end function callee
+end subroutine host_branch
+
+program main
+ call host(10)
+ call host_in_if(20, .true.)
+ call host_in_do_loop(30, 3)
+ call host_branch(40, .true.)
+end program main
More information about the cfe-commits
mailing list