[flang-commits] [flang] [llvm] [Flang] Adjust the trampoline size for AArch64 and PPC (PR #118678)

Tue Jan 21 20:17:14 PST 2025

https://github.com/ssijaric-nv updated https://github.com/llvm/llvm-project/pull/118678

>From 68ec0b31cd6393c29c0462fa2cb09ceac4aed5c8 Mon Sep 17 00:00:00 2001
From: Sanjin Sijaric <ssijaric at nvidia.com>
Date: Wed, 4 Dec 2024 09:36:55 -0800
Subject: [PATCH 1/5] [Flang] Adjust the trampoline size for AArch64 and PPC

The trampoline size is 36 bytes for AArch64, 40 bytes for PPC32 and 48 bytes
for PPC64.  During AArch64 and PPC lowering, init.trampoline is lowered
to a call to __trampoline_setup, with the corresponding trampoline sizes
passed to it.
---
 flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp | 10 +++++++++-
 flang/test/Fir/boxproc.fir                     | 12 +++++++++---
 2 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
index ad7272eaa9d3f3..747de8e0ea26f6 100644
--- a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
+++ b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
@@ -272,10 +272,18 @@ class BoxedProcedurePass
             // Create the thunk.
             auto module = embox->getParentOfType<mlir::ModuleOp>();
             FirOpBuilder builder(rewriter, module);
+            const auto triple{fir::getTargetTriple(builder.getModule())};
             auto loc = embox.getLoc();
             mlir::Type i8Ty = builder.getI8Type();
             mlir::Type i8Ptr = builder.getRefType(i8Ty);
-            mlir::Type buffTy = SequenceType::get({32}, i8Ty);
+            fir::SequenceType::Extent thunkSize = 32;
+            if (triple.isPPC32())
+              thunkSize = 40;
+            else if (triple.isPPC64())
+              thunkSize = 48;
+            else if (triple.isAArch64())
+               thunkSize = 36;
+            mlir::Type buffTy = SequenceType::get({thunkSize}, i8Ty);
             auto buffer = builder.create<AllocaOp>(loc, buffTy);
             mlir::Value closure =
                 builder.createConvert(loc, i8Ptr, embox.getHost());
diff --git a/flang/test/Fir/boxproc.fir b/flang/test/Fir/boxproc.fir
index 27d8953236e720..d5d78593dc8a74 100644
--- a/flang/test/Fir/boxproc.fir
+++ b/flang/test/Fir/boxproc.fir
@@ -1,7 +1,11 @@
-// RUN: tco %s | FileCheck %s
+// RUN: %if aarch64-registered-target %{tco --target=aarch64-unknown-linux-gnu %s | FileCheck %s --check-prefixes=CHECK,CHECK-AARCH64 %}
+// RUN: %if x86-registered-target %{tco --target=x86_64-unknown-linux-gnu %s | FileCheck %s --check-prefixes=CHECK,CHECK-X86 %}
+// RUN: %if powerpc-registered-target %{tco --target=powerpc64le-unknown-linux-gnu %s | FileCheck %s --check-prefixes=CHECK,CHECK-PPC %}
 
 // CHECK-LABEL: define void @_QPtest_proc_dummy()
-// CHECK:         %[[VAL_3:.*]] = alloca [32 x i8], i64 1, align 1
+// CHECK-AARCH64: %[[VAL_3:.*]] = alloca [36 x i8], i64 1, align 1
+// CHECK-X86:     %[[VAL_3:.*]] = alloca [32 x i8], i64 1, align 1
+// CHECK-PPC:     %[[VAL_3:.*]] = alloca [4{{[0-8]+}} x i8], i64 1, align 1
 // CHECK:         %[[VAL_1:.*]] = alloca { ptr }, i64 1, align 8
 // CHECK:         %[[VAL_0:.*]] = alloca i32, i64 1, align 4
 // CHECK:         %[[VAL_2:.*]] = getelementptr { ptr }, ptr %[[VAL_1]], i32 0, i32 0
@@ -59,7 +63,9 @@ func.func @_QPtest_proc_dummy_other(%arg0: !fir.boxproc<() -> ()>) {
 }
 
 // CHECK-LABEL: define void @_QPtest_proc_dummy_char()
-// CHECK:         %[[VAL_20:.*]] = alloca [32 x i8], i64 1, align 1
+// CHECK-AARCH64: %[[VAL_20:.*]] = alloca [36 x i8], i64 1, align 1
+// CHECK-X86:     %[[VAL_20:.*]] = alloca [32 x i8], i64 1, align 1
+// CHECK-PPC:     %[[VAL_20:.*]] = alloca [4{{[0-8]+}} x i8], i64 1, align 1
 // CHECK:         %[[VAL_2:.*]] = alloca { { ptr, i64 } }, i64 1, align 8
 // CHECK:         %[[VAL_1:.*]] = alloca [10 x i8], i64 1, align 1
 // CHECK:         %[[VAL_0:.*]] = alloca [40 x i8], i64 1, align 1

>From 5afab2893645711316d9f50aad0e0eab4e040969 Mon Sep 17 00:00:00 2001
From: Sanjin Sijaric <ssijaric at nvidia.com>
Date: Wed, 4 Dec 2024 10:33:51 -0800
Subject: [PATCH 2/5] Fix formatting

---
 flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
index 747de8e0ea26f6..56019dea875b78 100644
--- a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
+++ b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
@@ -282,7 +282,7 @@ class BoxedProcedurePass
             else if (triple.isPPC64())
               thunkSize = 48;
             else if (triple.isAArch64())
-               thunkSize = 36;
+              thunkSize = 36;
             mlir::Type buffTy = SequenceType::get({thunkSize}, i8Ty);
             auto buffer = builder.create<AllocaOp>(loc, buffTy);
             mlir::Value closure =

>From 6d767c263a82b7de2163bb4253196e5af23d02f5 Mon Sep 17 00:00:00 2001
From: Sanjin Sijaric <ssijaric at nvidia.com>
Date: Wed, 4 Dec 2024 13:59:38 -0800
Subject: [PATCH 3/5] Minor cosmetic change

---
 flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
index 56019dea875b78..2dabf69528c9f0 100644
--- a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
+++ b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
@@ -272,7 +272,7 @@ class BoxedProcedurePass
             // Create the thunk.
             auto module = embox->getParentOfType<mlir::ModuleOp>();
             FirOpBuilder builder(rewriter, module);
-            const auto triple{fir::getTargetTriple(builder.getModule())};
+            const auto triple{fir::getTargetTriple(module)};
             auto loc = embox.getLoc();
             mlir::Type i8Ty = builder.getI8Type();
             mlir::Type i8Ptr = builder.getRefType(i8Ty);

>From 7b36b827d8f1e05d6357d53eaffe2a9fbc9f2f4c Mon Sep 17 00:00:00 2001
From: Sanjin Sijaric <ssijaric at nvidia.com>
Date: Thu, 5 Dec 2024 17:51:59 -0800
Subject: [PATCH 4/5] Add comments

---
 flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
index 2dabf69528c9f0..55872dc85c1f38 100644
--- a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
+++ b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
@@ -276,6 +276,12 @@ class BoxedProcedurePass
             auto loc = embox.getLoc();
             mlir::Type i8Ty = builder.getI8Type();
             mlir::Type i8Ptr = builder.getRefType(i8Ty);
+            // For AArch64, PPC32 and PPC64, the thunk is populated by a call to
+            // __trampoline_setup, which is defined in
+            // compiler-rt/lib/builtins/trampoline_setup.c and requires the
+            // thunk size greater than 32 bytes.  For RISCV and x86_64, the
+            // thunk setup doesn't go through __trampoline_setup and fits in 32
+            // bytes.
             fir::SequenceType::Extent thunkSize = 32;
             if (triple.isPPC32())
               thunkSize = 40;

>From 273c5edd069416b4a2f89539dd4873a510727d14 Mon Sep 17 00:00:00 2001
From: Sanjin Sijaric <ssijaric at nvidia.com>
Date: Wed, 11 Dec 2024 17:40:34 -0800
Subject: [PATCH 5/5] Move the thunk size determination to the Triple

---
 .../lib/Optimizer/CodeGen/BoxedProcedure.cpp  |  8 +-------
 llvm/include/llvm/TargetParser/Triple.h       |  3 +++
 llvm/lib/TargetParser/Triple.cpp              | 20 +++++++++++++++++++
 3 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
index 55872dc85c1f38..f162cddaff34d7 100644
--- a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
+++ b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
@@ -282,13 +282,7 @@ class BoxedProcedurePass
             // thunk size greater than 32 bytes.  For RISCV and x86_64, the
             // thunk setup doesn't go through __trampoline_setup and fits in 32
             // bytes.
-            fir::SequenceType::Extent thunkSize = 32;
-            if (triple.isPPC32())
-              thunkSize = 40;
-            else if (triple.isPPC64())
-              thunkSize = 48;
-            else if (triple.isAArch64())
-              thunkSize = 36;
+            fir::SequenceType::Extent thunkSize = triple.getTrampolineSize();
             mlir::Type buffTy = SequenceType::get({thunkSize}, i8Ty);
             auto buffer = builder.create<AllocaOp>(loc, buffTy);
             mlir::Value closure =
diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h
index 8097300c6e630c..ed6f48fba788b1 100644
--- a/llvm/include/llvm/TargetParser/Triple.h
+++ b/llvm/include/llvm/TargetParser/Triple.h
@@ -498,6 +498,9 @@ class Triple {
     return getArchPointerBitWidth(getArch());
   }
 
+  /// Returns the trampoline size in bytes for this configuration.
+  unsigned getTrampolineSize() const;
+
   /// Test whether the architecture is 64-bit
   ///
   /// Note that this tests for 64-bit pointer width, and nothing else. Note
diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp
index ed58e72089839b..e9e6f130f757cf 100644
--- a/llvm/lib/TargetParser/Triple.cpp
+++ b/llvm/lib/TargetParser/Triple.cpp
@@ -1711,6 +1711,26 @@ unsigned Triple::getArchPointerBitWidth(llvm::Triple::ArchType Arch) {
   llvm_unreachable("Invalid architecture value");
 }
 
+unsigned Triple::getTrampolineSize() const {
+  switch (getArch()) {
+  default:
+    break;
+  case Triple::ppc:
+  case Triple::ppcle:
+    if (isOSLinux())
+      return 40;
+    break;
+  case Triple::ppc64:
+  case Triple::ppc64le:
+    if (isOSLinux())
+      return 48;
+    break;
+  case Triple::aarch64:
+    return 36;
+  }
+  return 32;
+}
+
 bool Triple::isArch64Bit() const {
   return getArchPointerBitWidth(getArch()) == 64;
 }