[flang] [llvm] [Flang] Adjust the trampoline size for AArch64 and PPC (PR #118678)

Wed Jan 22 11:10:06 PST 2025

https://github.com/ssijaric-nv updated https://github.com/llvm/llvm-project/pull/118678

>From 1e36021619625102ad909f41f1c0bb65bbd1ab8c Mon Sep 17 00:00:00 2001
From: Sanjin Sijaric <ssijaric at nvidia.com>
Date: Wed, 4 Dec 2024 09:36:55 -0800
Subject: [PATCH 1/5] [Flang] Adjust the trampoline size for AArch64 and PPC

The trampoline size is 36 bytes for AArch64, 40 bytes for PPC32 and 48 bytes
for PPC64.  During AArch64 and PPC lowering, init.trampoline is lowered
to a call to __trampoline_setup, with the corresponding trampoline sizes
passed to it.
---
 flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp | 10 +++++++++-
 flang/test/Fir/boxproc.fir                     | 12 +++++++++---
 2 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
index ad7272eaa9d3f3..747de8e0ea26f6 100644
--- a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
+++ b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
@@ -272,10 +272,18 @@ class BoxedProcedurePass
             // Create the thunk.
             auto module = embox->getParentOfType<mlir::ModuleOp>();
             FirOpBuilder builder(rewriter, module);
+            const auto triple{fir::getTargetTriple(builder.getModule())};
             auto loc = embox.getLoc();
             mlir::Type i8Ty = builder.getI8Type();
             mlir::Type i8Ptr = builder.getRefType(i8Ty);
-            mlir::Type buffTy = SequenceType::get({32}, i8Ty);
+            fir::SequenceType::Extent thunkSize = 32;
+            if (triple.isPPC32())
+              thunkSize = 40;
+            else if (triple.isPPC64())
+              thunkSize = 48;
+            else if (triple.isAArch64())
+               thunkSize = 36;
+            mlir::Type buffTy = SequenceType::get({thunkSize}, i8Ty);
             auto buffer = builder.create<AllocaOp>(loc, buffTy);
             mlir::Value closure =
                 builder.createConvert(loc, i8Ptr, embox.getHost());
diff --git a/flang/test/Fir/boxproc.fir b/flang/test/Fir/boxproc.fir
index 27d8953236e720..d5d78593dc8a74 100644
--- a/flang/test/Fir/boxproc.fir
+++ b/flang/test/Fir/boxproc.fir
@@ -1,7 +1,11 @@
-// RUN: tco %s | FileCheck %s
+// RUN: %if aarch64-registered-target %{tco --target=aarch64-unknown-linux-gnu %s | FileCheck %s --check-prefixes=CHECK,CHECK-AARCH64 %}
+// RUN: %if x86-registered-target %{tco --target=x86_64-unknown-linux-gnu %s | FileCheck %s --check-prefixes=CHECK,CHECK-X86 %}
+// RUN: %if powerpc-registered-target %{tco --target=powerpc64le-unknown-linux-gnu %s | FileCheck %s --check-prefixes=CHECK,CHECK-PPC %}
 
 // CHECK-LABEL: define void @_QPtest_proc_dummy()
-// CHECK:         %[[VAL_3:.*]] = alloca [32 x i8], i64 1, align 1
+// CHECK-AARCH64: %[[VAL_3:.*]] = alloca [36 x i8], i64 1, align 1
+// CHECK-X86:     %[[VAL_3:.*]] = alloca [32 x i8], i64 1, align 1
+// CHECK-PPC:     %[[VAL_3:.*]] = alloca [4{{[0-8]+}} x i8], i64 1, align 1
 // CHECK:         %[[VAL_1:.*]] = alloca { ptr }, i64 1, align 8
 // CHECK:         %[[VAL_0:.*]] = alloca i32, i64 1, align 4
 // CHECK:         %[[VAL_2:.*]] = getelementptr { ptr }, ptr %[[VAL_1]], i32 0, i32 0
@@ -59,7 +63,9 @@ func.func @_QPtest_proc_dummy_other(%arg0: !fir.boxproc<() -> ()>) {
 }
 
 // CHECK-LABEL: define void @_QPtest_proc_dummy_char()
-// CHECK:         %[[VAL_20:.*]] = alloca [32 x i8], i64 1, align 1
+// CHECK-AARCH64: %[[VAL_20:.*]] = alloca [36 x i8], i64 1, align 1
+// CHECK-X86:     %[[VAL_20:.*]] = alloca [32 x i8], i64 1, align 1
+// CHECK-PPC:     %[[VAL_20:.*]] = alloca [4{{[0-8]+}} x i8], i64 1, align 1
 // CHECK:         %[[VAL_2:.*]] = alloca { { ptr, i64 } }, i64 1, align 8
 // CHECK:         %[[VAL_1:.*]] = alloca [10 x i8], i64 1, align 1
 // CHECK:         %[[VAL_0:.*]] = alloca [40 x i8], i64 1, align 1

>From ae86c20e0258e164bcc522ea1741df61cc1910f6 Mon Sep 17 00:00:00 2001
From: Sanjin Sijaric <ssijaric at nvidia.com>
Date: Wed, 4 Dec 2024 10:33:51 -0800
Subject: [PATCH 2/5] Fix formatting

---
 flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
index 747de8e0ea26f6..56019dea875b78 100644
--- a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
+++ b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
@@ -282,7 +282,7 @@ class BoxedProcedurePass
             else if (triple.isPPC64())
               thunkSize = 48;
             else if (triple.isAArch64())
-               thunkSize = 36;
+              thunkSize = 36;
             mlir::Type buffTy = SequenceType::get({thunkSize}, i8Ty);
             auto buffer = builder.create<AllocaOp>(loc, buffTy);
             mlir::Value closure =

>From 49f3b4d081ec5e35497ac7d196e9100263f908c8 Mon Sep 17 00:00:00 2001
From: Sanjin Sijaric <ssijaric at nvidia.com>
Date: Wed, 4 Dec 2024 13:59:38 -0800
Subject: [PATCH 3/5] Minor cosmetic change

---
 flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
index 56019dea875b78..2dabf69528c9f0 100644
--- a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
+++ b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
@@ -272,7 +272,7 @@ class BoxedProcedurePass
             // Create the thunk.
             auto module = embox->getParentOfType<mlir::ModuleOp>();
             FirOpBuilder builder(rewriter, module);
-            const auto triple{fir::getTargetTriple(builder.getModule())};
+            const auto triple{fir::getTargetTriple(module)};
             auto loc = embox.getLoc();
             mlir::Type i8Ty = builder.getI8Type();
             mlir::Type i8Ptr = builder.getRefType(i8Ty);

>From bd1d766e3e47226edf7a36fe061009b63687e308 Mon Sep 17 00:00:00 2001
From: Sanjin Sijaric <ssijaric at nvidia.com>
Date: Thu, 5 Dec 2024 17:51:59 -0800
Subject: [PATCH 4/5] Add comments

---
 flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
index 2dabf69528c9f0..55872dc85c1f38 100644
--- a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
+++ b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
@@ -276,6 +276,12 @@ class BoxedProcedurePass
             auto loc = embox.getLoc();
             mlir::Type i8Ty = builder.getI8Type();
             mlir::Type i8Ptr = builder.getRefType(i8Ty);
+            // For AArch64, PPC32 and PPC64, the thunk is populated by a call to
+            // __trampoline_setup, which is defined in
+            // compiler-rt/lib/builtins/trampoline_setup.c and requires the
+            // thunk size greater than 32 bytes.  For RISCV and x86_64, the
+            // thunk setup doesn't go through __trampoline_setup and fits in 32
+            // bytes.
             fir::SequenceType::Extent thunkSize = 32;
             if (triple.isPPC32())
               thunkSize = 40;

>From 2191d54be46b4168d915ad12d5d00aa600152160 Mon Sep 17 00:00:00 2001
From: Sanjin Sijaric <ssijaric at nvidia.com>
Date: Wed, 11 Dec 2024 17:40:34 -0800
Subject: [PATCH 5/5] Move the thunk size determination to the Triple

---
 .../lib/Optimizer/CodeGen/BoxedProcedure.cpp  |  8 +-------
 llvm/include/llvm/TargetParser/Triple.h       |  3 +++
 llvm/lib/TargetParser/Triple.cpp              | 20 +++++++++++++++++++
 3 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
index 55872dc85c1f38..f162cddaff34d7 100644
--- a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
+++ b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
@@ -282,13 +282,7 @@ class BoxedProcedurePass
             // thunk size greater than 32 bytes.  For RISCV and x86_64, the
             // thunk setup doesn't go through __trampoline_setup and fits in 32
             // bytes.
-            fir::SequenceType::Extent thunkSize = 32;
-            if (triple.isPPC32())
-              thunkSize = 40;
-            else if (triple.isPPC64())
-              thunkSize = 48;
-            else if (triple.isAArch64())
-              thunkSize = 36;
+            fir::SequenceType::Extent thunkSize = triple.getTrampolineSize();
             mlir::Type buffTy = SequenceType::get({thunkSize}, i8Ty);
             auto buffer = builder.create<AllocaOp>(loc, buffTy);
             mlir::Value closure =
diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h
index 8097300c6e630c..ed6f48fba788b1 100644
--- a/llvm/include/llvm/TargetParser/Triple.h
+++ b/llvm/include/llvm/TargetParser/Triple.h
@@ -498,6 +498,9 @@ class Triple {
     return getArchPointerBitWidth(getArch());
   }
 
+  /// Returns the trampoline size in bytes for this configuration.
+  unsigned getTrampolineSize() const;
+
   /// Test whether the architecture is 64-bit
   ///
   /// Note that this tests for 64-bit pointer width, and nothing else. Note
diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp
index ed58e72089839b..e9e6f130f757cf 100644
--- a/llvm/lib/TargetParser/Triple.cpp
+++ b/llvm/lib/TargetParser/Triple.cpp
@@ -1711,6 +1711,26 @@ unsigned Triple::getArchPointerBitWidth(llvm::Triple::ArchType Arch) {
   llvm_unreachable("Invalid architecture value");
 }
 
+unsigned Triple::getTrampolineSize() const {
+  switch (getArch()) {
+  default:
+    break;
+  case Triple::ppc:
+  case Triple::ppcle:
+    if (isOSLinux())
+      return 40;
+    break;
+  case Triple::ppc64:
+  case Triple::ppc64le:
+    if (isOSLinux())
+      return 48;
+    break;
+  case Triple::aarch64:
+    return 36;
+  }
+  return 32;
+}
+
 bool Triple::isArch64Bit() const {
   return getArchPointerBitWidth(getArch()) == 64;
 }