[flang-commits] [flang] [flang] Inline scalar-to-scalar TRANSFER for same-size trivial types (PR #191589)
Zhen Wang via flang-commits
flang-commits at lists.llvm.org
Fri Apr 10 19:48:06 PDT 2026
https://github.com/wangzpgi created https://github.com/llvm/llvm-project/pull/191589
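Inline the TRANSFER intrinsic for scalar-to-scalar cases where the SIZE argument is absent, the result is a trivial type (integer, real, etc.), and the source and result have the same storage size. Instead of calling _FortranATransfer, the lowering now emits a fir.convert on the source address followed by a fir.load, effectively performing a reinterpret cast. Transfers that do not meet these conditions (for example, a source and result of different sizes) still go through the _FortranATransfer runtime call.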
>From 4d2e8daa192c321f00486d0e44ba222926c67739 Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Fri, 10 Apr 2026 19:46:00 -0700
Subject: [PATCH] Inline scalar-to-scalar TRANSFER for same-size trivial types
---
flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 27 +++++++-
flang/test/Lower/Intrinsics/transfer.f90 | 64 +++++++++++++------
2 files changed, 71 insertions(+), 20 deletions(-)
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index d6dee88f422e0..6b040fd342ad1 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -8680,6 +8680,31 @@ IntrinsicLibrary::genTransfer(mlir::Type resultType,
assert(args.size() >= 2); // args.size() == 2 when size argument is omitted.
+ bool absentSize = (args.size() == 2);
+
+ // Inline scalar-to-scalar transfers when the result is a trivial type
+ // (integer, real, etc.) and both source and result have the same storage
+ // size.
+ if (absentSize && fir::isa_trivial(resultType)) {
+ mlir::Value sourceBase = fir::getBase(args[0]);
+ mlir::Type sourceType = fir::unwrapRefType(sourceBase.getType());
+ if (fir::isa_ref_type(sourceBase.getType()) &&
+ !mlir::isa<fir::SequenceType>(sourceType)) {
+ auto &dl = builder.getDataLayout();
+ auto &kindMap = builder.getKindMap();
+ auto sourceSizeAndAlign =
+ fir::getTypeSizeAndAlignment(loc, sourceType, dl, kindMap);
+ auto resultSizeAndAlign =
+ fir::getTypeSizeAndAlignment(loc, resultType, dl, kindMap);
+ if (sourceSizeAndAlign && resultSizeAndAlign &&
+ sourceSizeAndAlign->first == resultSizeAndAlign->first) {
+ auto refTy = builder.getRefType(resultType);
+ auto cast = builder.createConvert(loc, refTy, sourceBase);
+ return fir::LoadOp::create(builder, loc, cast);
+ }
+ }
+ }
+
// Handle source argument
mlir::Value source = builder.createBox(loc, args[0]);
@@ -8688,8 +8713,6 @@ IntrinsicLibrary::genTransfer(mlir::Type resultType,
fir::BoxValue moldTmp = mold;
unsigned moldRank = moldTmp.rank();
- bool absentSize = (args.size() == 2);
-
// Create mutable fir.box to be passed to the runtime for the result.
mlir::Type type = (moldRank == 0 && absentSize)
? resultType
diff --git a/flang/test/Lower/Intrinsics/transfer.f90 b/flang/test/Lower/Intrinsics/transfer.f90
index a792c8e91ba01..a44a1e645c813 100644
--- a/flang/test/Lower/Intrinsics/transfer.f90
+++ b/flang/test/Lower/Intrinsics/transfer.f90
@@ -3,24 +3,9 @@
subroutine trans_test(store, word)
! CHECK-LABEL: func @_QPtrans_test(
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32>{{.*}}, %[[VAL_1:.*]]: !fir.ref<f32>{{.*}}) {
- ! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.box<!fir.heap<i32>>
- ! CHECK: %[[VAL_3:.*]] = fir.embox %[[VAL_1]] : (!fir.ref<f32>) -> !fir.box<f32>
- ! CHECK: %[[VAL_4:.*]] = fir.embox %[[VAL_0]] : (!fir.ref<i32>) -> !fir.box<i32>
- ! CHECK: %[[VAL_5:.*]] = fir.zero_bits !fir.heap<i32>
- ! CHECK: %[[VAL_6:.*]] = fir.embox %[[VAL_5]] : (!fir.heap<i32>) -> !fir.box<!fir.heap<i32>>
- ! CHECK: fir.store %[[VAL_6]] to %[[VAL_2]] : !fir.ref<!fir.box<!fir.heap<i32>>>
- ! CHECK: %[[VAL_7:.*]] = fir.address_of(@_QQclX{{.*}}) : !fir.ref<!fir.char<1,{{.*}}>>
- ! CHECK: %[[VAL_8:.*]] = arith.constant {{.*}} : i32
- ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_2]] : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> !fir.ref<!fir.box<none>>
- ! CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_3]] : (!fir.box<f32>) -> !fir.box<none>
- ! CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_4]] : (!fir.box<i32>) -> !fir.box<none>
- ! CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_7]] : (!fir.ref<!fir.char<1,{{.*}}>>) -> !fir.ref<i8>
- ! CHECK: fir.call @_FortranATransfer(%[[VAL_9]], %[[VAL_10]], %[[VAL_11]], %[[VAL_12]], %[[VAL_8]]) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> ()
- ! CHECK: %[[VAL_14:.*]] = fir.load %[[VAL_2]] : !fir.ref<!fir.box<!fir.heap<i32>>>
- ! CHECK: %[[VAL_15:.*]] = fir.box_addr %[[VAL_14]] : (!fir.box<!fir.heap<i32>>) -> !fir.heap<i32>
- ! CHECK: %[[VAL_16:.*]] = fir.load %[[VAL_15]] : !fir.heap<i32>
- ! CHECK: fir.freemem %[[VAL_15]]
- ! CHECK: fir.store %[[VAL_16]] to %[[VAL_0]] : !fir.ref<i32>
+ ! CHECK: %[[CAST:.*]] = fir.convert %[[VAL_1]] : (!fir.ref<f32>) -> !fir.ref<i32>
+ ! CHECK: %[[VAL:.*]] = fir.load %[[CAST]] : !fir.ref<i32>
+ ! CHECK: fir.store %[[VAL]] to %[[VAL_0]] : !fir.ref<i32>
! CHECK: return
! CHECK: }
integer :: store
@@ -120,3 +105,46 @@ integer function trans_test3(p)
t = transfer(p, t)
trans_test3 = t%x
end function
+
+ ! Scalar same-size transfer (f64 -> i64) is inlined as fir.convert + fir.load.
+ subroutine trans_test_r8_to_i8(store, word)
+ ! CHECK-LABEL: func @_QPtrans_test_r8_to_i8(
+ ! CHECK-SAME: %[[RES:.*]]: !fir.ref<i64>{{.*}}, %[[SRC:.*]]: !fir.ref<f64>{{.*}}) {
+ ! CHECK: %[[CAST:.*]] = fir.convert %[[SRC]] : (!fir.ref<f64>) -> !fir.ref<i64>
+ ! CHECK: %[[VAL:.*]] = fir.load %[[CAST]] : !fir.ref<i64>
+ ! CHECK: fir.store %[[VAL]] to %[[RES]] : !fir.ref<i64>
+ ! CHECK-NOT: fir.call @_FortranATransfer
+ ! CHECK: return
+ ! CHECK: }
+ integer(8) :: store
+ real(8) :: word
+ store = transfer(word, store)
+ end subroutine
+
+ ! BIND(C) struct (c_ptr) to integer(8): same byte size, inlined.
+ ! This covers the c_devptr pattern on CUDA device code.
+ subroutine trans_test_cptr_to_i8(store, src)
+ ! CHECK-LABEL: func @_QPtrans_test_cptr_to_i8(
+ ! CHECK-SAME: %[[RES:.*]]: !fir.ref<i64>{{.*}}, %[[SRC:.*]]: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>{{.*}}) {
+ ! CHECK: %[[CAST:.*]] = fir.convert %[[SRC]] : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) -> !fir.ref<i64>
+ ! CHECK: %[[VAL:.*]] = fir.load %[[CAST]] : !fir.ref<i64>
+ ! CHECK: fir.store %[[VAL]] to %[[RES]] : !fir.ref<i64>
+ ! CHECK-NOT: fir.call @_FortranATransfer
+ ! CHECK: return
+ ! CHECK: }
+ use iso_c_binding
+ integer(8) :: store
+ type(c_ptr) :: src
+ store = transfer(src, store)
+ end subroutine
+
+ ! Different-size scalar transfer (i32 -> i64) falls back to runtime.
+ subroutine trans_test_diff_size(store, src)
+ ! CHECK-LABEL: func @_QPtrans_test_diff_size(
+ ! CHECK: fir.call @_FortranATransfer(
+ ! CHECK: return
+ ! CHECK: }
+ integer(8) :: store
+ integer(4) :: src
+ store = transfer(src, store)
+ end subroutine
More information about the flang-commits
mailing list