[flang-commits] [flang] ca3bc44 - [flang] Inline scalar-to-scalar TRANSFER for same-size trivial types (#191589)
via flang-commits
flang-commits at lists.llvm.org
Thu Apr 16 12:10:53 PDT 2026
Author: Zhen Wang
Date: 2026-04-16T19:10:47Z
New Revision: ca3bc44c3090481615bd8fc4b3e64358b845c8bf
URL: https://github.com/llvm/llvm-project/commit/ca3bc44c3090481615bd8fc4b3e64358b845c8bf
DIFF: https://github.com/llvm/llvm-project/commit/ca3bc44c3090481615bd8fc4b3e64358b845c8bf.diff
LOG: [flang] Inline scalar-to-scalar TRANSFER for same-size trivial types (#191589)
Inline the TRANSFER intrinsic for scalar-to-scalar cases where the
result is a trivial type (integer, real, etc.) and source and result
have the same storage size. Instead of calling _FortranATransfer, the
lowering now emits a fir.convert on the source address followed by a
fir.load, effectively performing a reinterpret cast.
Added:
Modified:
flang/lib/Optimizer/Builder/IntrinsicCall.cpp
flang/test/Lower/Intrinsics/transfer.f90
Removed:
################################################################################
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index d6dee88f422e0..3623323e8cf3d 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -8680,6 +8680,39 @@ IntrinsicLibrary::genTransfer(mlir::Type resultType,
assert(args.size() >= 2); // args.size() == 2 when size argument is omitted.
+ bool absentSize = (args.size() == 2);
+
+ // Inline scalar-to-scalar transfers when the result is a trivial type
+ // (integer, real, etc.) and both source and result have the same storage
+ // size.
+ if (absentSize && fir::isa_trivial(resultType)) {
+ mlir::Value sourceBase = fir::getBase(args[0]);
+ mlir::Type sourceType = fir::unwrapRefType(sourceBase.getType());
+ mlir::Type moldType = fir::unwrapRefType(fir::getBase(args[1]).getType());
+ if (fir::isa_ref_type(sourceBase.getType()) &&
+ (fir::isa_trivial(sourceType) ||
+ mlir::isa<fir::RecordType>(sourceType)) &&
+ fir::isa_trivial(moldType)) {
+ auto sourceSizeAndAlign = fir::getTypeSizeAndAlignment(
+ loc, sourceType, builder.getDataLayout(), builder.getKindMap());
+ auto resultSizeAndAlign = fir::getTypeSizeAndAlignment(
+ loc, resultType, builder.getDataLayout(), builder.getKindMap());
+ if (sourceSizeAndAlign && resultSizeAndAlign &&
+ sourceSizeAndAlign->first == resultSizeAndAlign->first) {
+ if (mlir::isa<mlir::IntegerType, mlir::FloatType>(sourceType) &&
+ mlir::isa<mlir::IntegerType, mlir::FloatType>(resultType)) {
+ mlir::Value val = fir::LoadOp::create(builder, loc, sourceBase);
+ if (sourceType != resultType)
+ val = mlir::arith::BitcastOp::create(builder, loc, resultType, val);
+ return val;
+ }
+ mlir::Type refTy = builder.getRefType(resultType);
+ mlir::Value cast = builder.createConvert(loc, refTy, sourceBase);
+ return fir::LoadOp::create(builder, loc, cast);
+ }
+ }
+ }
+
// Handle source argument
mlir::Value source = builder.createBox(loc, args[0]);
@@ -8688,8 +8721,6 @@ IntrinsicLibrary::genTransfer(mlir::Type resultType,
fir::BoxValue moldTmp = mold;
unsigned moldRank = moldTmp.rank();
- bool absentSize = (args.size() == 2);
-
// Create mutable fir.box to be passed to the runtime for the result.
mlir::Type type = (moldRank == 0 && absentSize)
? resultType
diff --git a/flang/test/Lower/Intrinsics/transfer.f90 b/flang/test/Lower/Intrinsics/transfer.f90
index 6a9ea14570fb3..7afdfd28c2ae1 100644
--- a/flang/test/Lower/Intrinsics/transfer.f90
+++ b/flang/test/Lower/Intrinsics/transfer.f90
@@ -3,17 +3,12 @@
subroutine trans_test(store, word)
! CHECK-LABEL: func @_QPtrans_test(
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32>{{.*}}, %[[VAL_1:.*]]: !fir.ref<f32>{{.*}}) {
- ! CHECK-DAG: %[[RESULT_BOX:.*]] = fir.alloca !fir.box<!fir.heap<i32>>
! CHECK-DAG: %[[store:.*]]:2 = hlfir.declare %[[VAL_0]] {{.*}}{uniq_name = "_QFtrans_testEstore"}
! CHECK-DAG: %[[word:.*]]:2 = hlfir.declare %[[VAL_1]] {{.*}}{uniq_name = "_QFtrans_testEword"}
- ! CHECK: %[[VAL_3:.*]] = fir.embox %[[word]]#0 : (!fir.ref<f32>) -> !fir.box<f32>
- ! CHECK: %[[VAL_4:.*]] = fir.embox %[[store]]#0 : (!fir.ref<i32>) -> !fir.box<i32>
- ! CHECK: fir.call @_FortranATransfer({{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> ()
- ! CHECK: %[[LOADED:.*]] = fir.load %[[RESULT_BOX]] : !fir.ref<!fir.box<!fir.heap<i32>>>
- ! CHECK: %[[ADDR:.*]] = fir.box_addr %[[LOADED]] : (!fir.box<!fir.heap<i32>>) -> !fir.heap<i32>
- ! CHECK: %[[VAL:.*]] = fir.load %[[ADDR]] : !fir.heap<i32>
- ! CHECK: fir.freemem %[[ADDR]]
+ ! CHECK: %[[LOADED:.*]] = fir.load %[[word]]#0 : !fir.ref<f32>
+ ! CHECK: %[[VAL:.*]] = arith.bitcast %[[LOADED]] : f32 to i32
! CHECK: hlfir.assign %[[VAL]] to %[[store]]#0 : i32, !fir.ref<i32>
+ ! CHECK-NOT: fir.call @_FortranATransfer
! CHECK: return
! CHECK: }
integer :: store
@@ -54,3 +49,105 @@ integer function trans_test3(p)
t = transfer(p, t)
trans_test3 = t%x
end function
+
+ ! Scalar same-size transfer (f64 -> i64) is inlined as fir.load + arith.bitcast.
+ subroutine trans_test_r8_to_i8(store, word)
+ ! CHECK-LABEL: func @_QPtrans_test_r8_to_i8(
+ ! CHECK-SAME: %[[RES:.*]]: !fir.ref<i64>{{.*}}, %[[SRC:.*]]: !fir.ref<f64>{{.*}}) {
+ ! CHECK-DAG: %[[store:.*]]:2 = hlfir.declare %[[RES]] {{.*}}{uniq_name = "_QFtrans_test_r8_to_i8Estore"}
+ ! CHECK-DAG: %[[word:.*]]:2 = hlfir.declare %[[SRC]] {{.*}}{uniq_name = "_QFtrans_test_r8_to_i8Eword"}
+ ! CHECK: %[[LOADED:.*]] = fir.load %[[word]]#0 : !fir.ref<f64>
+ ! CHECK: %[[VAL:.*]] = arith.bitcast %[[LOADED]] : f64 to i64
+ ! CHECK: hlfir.assign %[[VAL]] to %[[store]]#0 : i64, !fir.ref<i64>
+ ! CHECK-NOT: fir.call @_FortranATransfer
+ ! CHECK: return
+ ! CHECK: }
+ integer(8) :: store
+ real(8) :: word
+ store = transfer(word, store)
+ end subroutine
+
+ ! BIND(C) struct (c_ptr) to integer(8): same byte size, inlined via
+ ! address-level reinterpret. Covers the c_devptr pattern on CUDA device code.
+ subroutine trans_test_cptr_to_i8(store, src)
+ ! CHECK-LABEL: func @_QPtrans_test_cptr_to_i8(
+ ! CHECK: %[[srcDecl:.*]]:2 = hlfir.declare {{.*}}{uniq_name = "_QFtrans_test_cptr_to_i8Esrc"}
+ ! CHECK: %[[storeDecl:.*]]:2 = hlfir.declare {{.*}}{uniq_name = "_QFtrans_test_cptr_to_i8Estore"}
+ ! CHECK: %[[CAST:.*]] = fir.convert %[[srcDecl]]#0 : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) -> !fir.ref<i64>
+ ! CHECK: %[[VAL:.*]] = fir.load %[[CAST]] : !fir.ref<i64>
+ ! CHECK: hlfir.assign %[[VAL]] to %[[storeDecl]]#0 : i64, !fir.ref<i64>
+ ! CHECK-NOT: fir.call @_FortranATransfer
+ ! CHECK: return
+ ! CHECK: }
+ use iso_c_binding
+ integer(8) :: store
+ type(c_ptr) :: src
+ store = transfer(src, store)
+ end subroutine
+
+ ! Different-size scalar transfer (i32 -> i64) falls back to runtime.
+ subroutine trans_test_diff_size(store, src)
+ ! CHECK-LABEL: func @_QPtrans_test_diff_size(
+ ! CHECK: fir.call @_FortranATransfer(
+ ! CHECK: return
+ ! CHECK: }
+ integer(8) :: store
+ integer(4) :: src
+ store = transfer(src, store)
+ end subroutine
+
+ ! Array mold without SIZE: result is rank-1 array, must use runtime.
+ subroutine trans_test_array_mold(src, result)
+ ! CHECK-LABEL: func @_QPtrans_test_array_mold(
+ ! CHECK: fir.call @_FortranATransfer(
+ ! CHECK: return
+ ! CHECK: }
+ real :: src
+ integer, allocatable :: result(:)
+ integer :: mold(4)
+ result = transfer(src, mold)
+ end subroutine
+
+ ! Allocatable mold: must use runtime.
+ subroutine trans_test_alloc_mold(src, result)
+ ! CHECK-LABEL: func @_QPtrans_test_alloc_mold(
+ ! CHECK: fir.call @_FortranATransfer(
+ ! CHECK: return
+ ! CHECK: }
+ real :: src
+ integer, allocatable :: mold(:)
+ integer, allocatable :: result(:)
+ result = transfer(src, mold)
+ end subroutine
+
+ ! POINTER source: descriptor is unpacked before reaching genTransfer,
+ ! so the inline optimization applies.
+ subroutine trans_test_pointer_source(store, src)
+ ! CHECK-LABEL: func @_QPtrans_test_pointer_source(
+ ! CHECK: fir.load {{.*}} : !fir.ref<!fir.box<!fir.ptr<f32>>>
+ ! CHECK: fir.box_addr
+ ! CHECK: %[[VAL:.*]] = fir.load {{.*}} : !fir.ptr<f32>
+ ! CHECK: arith.bitcast %[[VAL]] : f32 to i32
+ ! CHECK-NOT: fir.call @_FortranATransfer
+ ! CHECK: return
+ ! CHECK: }
+ integer :: store
+ real, pointer :: src
+ store = transfer(src, store)
+ end subroutine
+
+ ! ALLOCATABLE source: descriptor is unpacked before reaching genTransfer,
+ ! so the inline optimization applies.
+ subroutine trans_test_alloc_source(store, src)
+ ! CHECK-LABEL: func @_QPtrans_test_alloc_source(
+ ! CHECK: fir.load {{.*}} : !fir.ref<!fir.box<!fir.heap<f32>>>
+ ! CHECK: fir.box_addr
+ ! CHECK: %[[VAL:.*]] = fir.load {{.*}} : !fir.heap<f32>
+ ! CHECK: arith.bitcast %[[VAL]] : f32 to i32
+ ! CHECK-NOT: fir.call @_FortranATransfer
+ ! CHECK: return
+ ! CHECK: }
+ integer :: store
+ real, allocatable :: src
+ store = transfer(src, store)
+ end subroutine
More information about the flang-commits
mailing list