[flang-commits] [flang] [Flang] Adjust the trampoline size for AArch64 and PPC (PR #118678)

Thu Dec 5 18:10:31 PST 2024

https://github.com/ssijaric-nv updated https://github.com/llvm/llvm-project/pull/118678

>From d85342a75718f644fe3e36fbe6e3733326761d26 Mon Sep 17 00:00:00 2001
From: Sanjin Sijaric <ssijaric at nvidia.com>
Date: Wed, 4 Dec 2024 09:36:55 -0800
Subject: [PATCH 1/4] [Flang] Adjust the trampoline size for AArch64 and PPC

The trampoline size is 36 bytes for AArch64, 40 bytes for PPC32 and 48 bytes
for PPC64.  During AArch64 and PPC lowering, init.trampoline is lowered
to a call to __trampoline_setup, with the corresponding trampoline sizes
passed to it.
---
 flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp | 10 +++++++++-
 flang/test/Fir/boxproc.fir                     | 12 +++++++++---
 2 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
index c536fd19fcc69a..a942c80bf4e934 100644
--- a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
+++ b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
@@ -268,10 +268,18 @@ class BoxedProcedurePass
             // Create the thunk.
             auto module = embox->getParentOfType<mlir::ModuleOp>();
             FirOpBuilder builder(rewriter, module);
+            const auto triple{fir::getTargetTriple(builder.getModule())};
             auto loc = embox.getLoc();
             mlir::Type i8Ty = builder.getI8Type();
             mlir::Type i8Ptr = builder.getRefType(i8Ty);
-            mlir::Type buffTy = SequenceType::get({32}, i8Ty);
+            fir::SequenceType::Extent thunkSize = 32;
+            if (triple.isPPC32())
+              thunkSize = 40;
+            else if (triple.isPPC64())
+              thunkSize = 48;
+            else if (triple.isAArch64())
+               thunkSize = 36;
+            mlir::Type buffTy = SequenceType::get({thunkSize}, i8Ty);
             auto buffer = builder.create<AllocaOp>(loc, buffTy);
             mlir::Value closure =
                 builder.createConvert(loc, i8Ptr, embox.getHost());
diff --git a/flang/test/Fir/boxproc.fir b/flang/test/Fir/boxproc.fir
index 9e4ea0bc210775..fbc1329892cbc7 100644
--- a/flang/test/Fir/boxproc.fir
+++ b/flang/test/Fir/boxproc.fir
@@ -1,7 +1,11 @@
-// RUN: tco %s | FileCheck %s
+// RUN: %if aarch64-registered-target %{tco --target=aarch64-unknown-linux-gnu %s | FileCheck %s --check-prefixes=CHECK,CHECK-AARCH64 %}
+// RUN: %if x86-registered-target %{tco --target=x86_64-unknown-linux-gnu %s | FileCheck %s --check-prefixes=CHECK,CHECK-X86 %}
+// RUN: %if powerpc-registered-target %{tco --target=powerpc64le-unknown-linux-gnu %s | FileCheck %s --check-prefixes=CHECK,CHECK-PPC %}
 
 // CHECK-LABEL: define void @_QPtest_proc_dummy()
-// CHECK:         %[[VAL_3:.*]] = alloca [32 x i8], i64 1, align 1
+// CHECK-AARCH64: %[[VAL_3:.*]] = alloca [36 x i8], i64 1, align 1
+// CHECK-X86:     %[[VAL_3:.*]] = alloca [32 x i8], i64 1, align 1
+// CHECK-PPC:     %[[VAL_3:.*]] = alloca [4{{[0-8]+}} x i8], i64 1, align 1
 // CHECK:         %[[VAL_1:.*]] = alloca { ptr }, i64 1, align 8
 // CHECK:         %[[VAL_0:.*]] = alloca i32, i64 1, align 4
 // CHECK:         %[[VAL_2:.*]] = getelementptr { ptr }, ptr %[[VAL_1]], i32 0, i32 0
@@ -59,7 +63,9 @@ func.func @_QPtest_proc_dummy_other(%arg0: !fir.boxproc<() -> ()>) {
 }
 
 // CHECK-LABEL: define void @_QPtest_proc_dummy_char()
-// CHECK:         %[[VAL_20:.*]] = alloca [32 x i8], i64 1, align 1
+// CHECK-AARCH64: %[[VAL_20:.*]] = alloca [36 x i8], i64 1, align 1
+// CHECK-X86:     %[[VAL_20:.*]] = alloca [32 x i8], i64 1, align 1
+// CHECK-PPC:     %[[VAL_20:.*]] = alloca [4{{[0-8]+}} x i8], i64 1, align 1
 // CHECK:         %[[VAL_2:.*]] = alloca { { ptr, i64 } }, i64 1, align 8
 // CHECK:         %[[VAL_1:.*]] = alloca [10 x i8], i64 1, align 1
 // CHECK:         %[[VAL_0:.*]] = alloca [40 x i8], i64 1, align 1

>From 3c79e5814d030085b543e7e304efe4235f44d590 Mon Sep 17 00:00:00 2001
From: Sanjin Sijaric <ssijaric at nvidia.com>
Date: Wed, 4 Dec 2024 10:33:51 -0800
Subject: [PATCH 2/4] Fix formatting

---
 flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
index a942c80bf4e934..44b6140655163b 100644
--- a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
+++ b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
@@ -278,7 +278,7 @@ class BoxedProcedurePass
             else if (triple.isPPC64())
               thunkSize = 48;
             else if (triple.isAArch64())
-               thunkSize = 36;
+              thunkSize = 36;
             mlir::Type buffTy = SequenceType::get({thunkSize}, i8Ty);
             auto buffer = builder.create<AllocaOp>(loc, buffTy);
             mlir::Value closure =

>From ab81ef598a910cb9e3f460a61c5e01cfb1cb00c6 Mon Sep 17 00:00:00 2001
From: Sanjin Sijaric <ssijaric at nvidia.com>
Date: Wed, 4 Dec 2024 13:59:38 -0800
Subject: [PATCH 3/4] Minor cosmetic change

---
 flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
index 44b6140655163b..e1c9e97d0b535b 100644
--- a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
+++ b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
@@ -268,7 +268,7 @@ class BoxedProcedurePass
             // Create the thunk.
             auto module = embox->getParentOfType<mlir::ModuleOp>();
             FirOpBuilder builder(rewriter, module);
-            const auto triple{fir::getTargetTriple(builder.getModule())};
+            const auto triple{fir::getTargetTriple(module)};
             auto loc = embox.getLoc();
             mlir::Type i8Ty = builder.getI8Type();
             mlir::Type i8Ptr = builder.getRefType(i8Ty);

>From 5e5b89e6a552ec71ff3a6afbf044ec0826320e6c Mon Sep 17 00:00:00 2001
From: Sanjin Sijaric <ssijaric at nvidia.com>
Date: Thu, 5 Dec 2024 17:51:59 -0800
Subject: [PATCH 4/4] Add comments

---
 flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
index e1c9e97d0b535b..7419a9e339d468 100644
--- a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
+++ b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
@@ -272,6 +272,12 @@ class BoxedProcedurePass
             auto loc = embox.getLoc();
             mlir::Type i8Ty = builder.getI8Type();
             mlir::Type i8Ptr = builder.getRefType(i8Ty);
+            // For AArch64, PPC32 and PPC64, the thunk is populated by a call to
+            // __trampoline_setup, which is defined in
+            // compiler-rt/lib/builtins/trampoline_setup.c and requires the
+            // thunk size greater than 32 bytes.  For RISCV and x86_64, the
+            // thunk setup doesn't go through __trampoline_setup and fits in 32
+            // bytes.
             fir::SequenceType::Extent thunkSize = 32;
             if (triple.isPPC32())
               thunkSize = 40;