[flang-commits] [flang] [flang] Inline scalar-to-scalar TRANSFER for same-size trivial types (PR #191589)
Zhen Wang via flang-commits
flang-commits at lists.llvm.org
Mon Apr 13 12:02:53 PDT 2026
https://github.com/wangzpgi updated https://github.com/llvm/llvm-project/pull/191589
>From c05e74be71f9779d51868842272f20c793282057 Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Fri, 10 Apr 2026 19:46:00 -0700
Subject: [PATCH 1/4] Inline scalar-to-scalar TRANSFER for same-size trivial
types
---
flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 27 ++++++++-
flang/test/Lower/Intrinsics/transfer.f90 | 57 ++++++++++++++++---
2 files changed, 74 insertions(+), 10 deletions(-)
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index d6dee88f422e0..6b040fd342ad1 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -8680,6 +8680,31 @@ IntrinsicLibrary::genTransfer(mlir::Type resultType,
assert(args.size() >= 2); // args.size() == 2 when size argument is omitted.
+ bool absentSize = (args.size() == 2);
+
+ // Inline scalar-to-scalar transfers when the result is a trivial type
+ // (integer, real, etc.) and both source and result have the same storage
+ // size.
+ if (absentSize && fir::isa_trivial(resultType)) {
+ mlir::Value sourceBase = fir::getBase(args[0]);
+ mlir::Type sourceType = fir::unwrapRefType(sourceBase.getType());
+ if (fir::isa_ref_type(sourceBase.getType()) &&
+ !mlir::isa<fir::SequenceType>(sourceType)) {
+ auto &dl = builder.getDataLayout();
+ auto &kindMap = builder.getKindMap();
+ auto sourceSizeAndAlign =
+ fir::getTypeSizeAndAlignment(loc, sourceType, dl, kindMap);
+ auto resultSizeAndAlign =
+ fir::getTypeSizeAndAlignment(loc, resultType, dl, kindMap);
+ if (sourceSizeAndAlign && resultSizeAndAlign &&
+ sourceSizeAndAlign->first == resultSizeAndAlign->first) {
+ auto refTy = builder.getRefType(resultType);
+ auto cast = builder.createConvert(loc, refTy, sourceBase);
+ return fir::LoadOp::create(builder, loc, cast);
+ }
+ }
+ }
+
// Handle source argument
mlir::Value source = builder.createBox(loc, args[0]);
@@ -8688,8 +8713,6 @@ IntrinsicLibrary::genTransfer(mlir::Type resultType,
fir::BoxValue moldTmp = mold;
unsigned moldRank = moldTmp.rank();
- bool absentSize = (args.size() == 2);
-
// Create mutable fir.box to be passed to the runtime for the result.
mlir::Type type = (moldRank == 0 && absentSize)
? resultType
diff --git a/flang/test/Lower/Intrinsics/transfer.f90 b/flang/test/Lower/Intrinsics/transfer.f90
index 6a9ea14570fb3..e2c7b51143a17 100644
--- a/flang/test/Lower/Intrinsics/transfer.f90
+++ b/flang/test/Lower/Intrinsics/transfer.f90
@@ -3,17 +3,12 @@
subroutine trans_test(store, word)
! CHECK-LABEL: func @_QPtrans_test(
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32>{{.*}}, %[[VAL_1:.*]]: !fir.ref<f32>{{.*}}) {
- ! CHECK-DAG: %[[RESULT_BOX:.*]] = fir.alloca !fir.box<!fir.heap<i32>>
! CHECK-DAG: %[[store:.*]]:2 = hlfir.declare %[[VAL_0]] {{.*}}{uniq_name = "_QFtrans_testEstore"}
! CHECK-DAG: %[[word:.*]]:2 = hlfir.declare %[[VAL_1]] {{.*}}{uniq_name = "_QFtrans_testEword"}
- ! CHECK: %[[VAL_3:.*]] = fir.embox %[[word]]#0 : (!fir.ref<f32>) -> !fir.box<f32>
- ! CHECK: %[[VAL_4:.*]] = fir.embox %[[store]]#0 : (!fir.ref<i32>) -> !fir.box<i32>
- ! CHECK: fir.call @_FortranATransfer({{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> ()
- ! CHECK: %[[LOADED:.*]] = fir.load %[[RESULT_BOX]] : !fir.ref<!fir.box<!fir.heap<i32>>>
- ! CHECK: %[[ADDR:.*]] = fir.box_addr %[[LOADED]] : (!fir.box<!fir.heap<i32>>) -> !fir.heap<i32>
- ! CHECK: %[[VAL:.*]] = fir.load %[[ADDR]] : !fir.heap<i32>
- ! CHECK: fir.freemem %[[ADDR]]
+ ! CHECK: %[[LOADED:.*]] = fir.load %[[word]]#0 : !fir.ref<f32>
+ ! CHECK: %[[VAL:.*]] = arith.bitcast %[[LOADED]] : f32 to i32
! CHECK: hlfir.assign %[[VAL]] to %[[store]]#0 : i32, !fir.ref<i32>
+ ! CHECK-NOT: fir.call @_FortranATransfer
! CHECK: return
! CHECK: }
integer :: store
@@ -54,3 +49,49 @@ integer function trans_test3(p)
t = transfer(p, t)
trans_test3 = t%x
end function
+
+ ! Scalar same-size transfer (f64 -> i64) is inlined as fir.load + arith.bitcast.
+ subroutine trans_test_r8_to_i8(store, word)
+ ! CHECK-LABEL: func @_QPtrans_test_r8_to_i8(
+ ! CHECK-SAME: %[[RES:.*]]: !fir.ref<i64>{{.*}}, %[[SRC:.*]]: !fir.ref<f64>{{.*}}) {
+ ! CHECK-DAG: %[[store:.*]]:2 = hlfir.declare %[[RES]] {{.*}}{uniq_name = "_QFtrans_test_r8_to_i8Estore"}
+ ! CHECK-DAG: %[[word:.*]]:2 = hlfir.declare %[[SRC]] {{.*}}{uniq_name = "_QFtrans_test_r8_to_i8Eword"}
+ ! CHECK: %[[LOADED:.*]] = fir.load %[[word]]#0 : !fir.ref<f64>
+ ! CHECK: %[[VAL:.*]] = arith.bitcast %[[LOADED]] : f64 to i64
+ ! CHECK: hlfir.assign %[[VAL]] to %[[store]]#0 : i64, !fir.ref<i64>
+ ! CHECK-NOT: fir.call @_FortranATransfer
+ ! CHECK: return
+ ! CHECK: }
+ integer(8) :: store
+ real(8) :: word
+ store = transfer(word, store)
+ end subroutine
+
+ ! BIND(C) struct (c_ptr) to integer(8): same byte size, inlined via an
+ ! address-level reinterpretation (fir.convert + fir.load). Covers the c_devptr pattern in CUDA device code.
+ subroutine trans_test_cptr_to_i8(store, src)
+ ! CHECK-LABEL: func @_QPtrans_test_cptr_to_i8(
+ ! CHECK: %[[srcDecl:.*]]:2 = hlfir.declare {{.*}}{uniq_name = "_QFtrans_test_cptr_to_i8Esrc"}
+ ! CHECK: %[[storeDecl:.*]]:2 = hlfir.declare {{.*}}{uniq_name = "_QFtrans_test_cptr_to_i8Estore"}
+ ! CHECK: %[[CAST:.*]] = fir.convert %[[srcDecl]]#0 : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) -> !fir.ref<i64>
+ ! CHECK: %[[VAL:.*]] = fir.load %[[CAST]] : !fir.ref<i64>
+ ! CHECK: hlfir.assign %[[VAL]] to %[[storeDecl]]#0 : i64, !fir.ref<i64>
+ ! CHECK-NOT: fir.call @_FortranATransfer
+ ! CHECK: return
+ ! CHECK: }
+ use iso_c_binding
+ integer(8) :: store
+ type(c_ptr) :: src
+ store = transfer(src, store)
+ end subroutine
+
+ ! Different-size scalar transfer (i32 -> i64) falls back to runtime.
+ subroutine trans_test_diff_size(store, src)
+ ! CHECK-LABEL: func @_QPtrans_test_diff_size(
+ ! CHECK: fir.call @_FortranATransfer(
+ ! CHECK: return
+ ! CHECK: }
+ integer(8) :: store
+ integer(4) :: src
+ store = transfer(src, store)
+ end subroutine
>From d2747e1a20f91d8d9a7c94df0c1e17b911e08b16 Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Fri, 10 Apr 2026 19:56:07 -0700
Subject: [PATCH 2/4] code reorg
---
flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 14 ++++++--------
1 file changed, 6 insertions(+), 8 deletions(-)
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index 6b040fd342ad1..fd67ef93e752f 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -8690,16 +8690,14 @@ IntrinsicLibrary::genTransfer(mlir::Type resultType,
mlir::Type sourceType = fir::unwrapRefType(sourceBase.getType());
if (fir::isa_ref_type(sourceBase.getType()) &&
!mlir::isa<fir::SequenceType>(sourceType)) {
- auto &dl = builder.getDataLayout();
- auto &kindMap = builder.getKindMap();
- auto sourceSizeAndAlign =
- fir::getTypeSizeAndAlignment(loc, sourceType, dl, kindMap);
- auto resultSizeAndAlign =
- fir::getTypeSizeAndAlignment(loc, resultType, dl, kindMap);
+ auto sourceSizeAndAlign = fir::getTypeSizeAndAlignment(
+ loc, sourceType, builder.getDataLayout(), builder.getKindMap());
+ auto resultSizeAndAlign = fir::getTypeSizeAndAlignment(
+ loc, resultType, builder.getDataLayout(), builder.getKindMap());
if (sourceSizeAndAlign && resultSizeAndAlign &&
sourceSizeAndAlign->first == resultSizeAndAlign->first) {
- auto refTy = builder.getRefType(resultType);
- auto cast = builder.createConvert(loc, refTy, sourceBase);
+ mlir::Type refTy = builder.getRefType(resultType);
+ mlir::Value cast = builder.createConvert(loc, refTy, sourceBase);
return fir::LoadOp::create(builder, loc, cast);
}
}
>From b2e49372e1705da5bbeddf8be1b56fd1cf6233d0 Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Mon, 13 Apr 2026 11:28:33 -0700
Subject: [PATCH 3/4] Use arith.bitcast for trivial-to-trivial transfer instead
of address-level type punning
For scalar transfers where both the source and the result are integer or floating-point types, emit a fir.load followed by an arith.bitcast (the bitcast is omitted when the two types are identical) instead of a fir.convert of the reference type followed by a fir.load. This produces cleaner value-level IR that directly expresses the bit-reinterpretation semantics. The address-level approach is retained for non-trivial source types (e.g., c_ptr to integer).
---
flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index fd67ef93e752f..7d744e49c0695 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -8696,6 +8696,14 @@ IntrinsicLibrary::genTransfer(mlir::Type resultType,
loc, resultType, builder.getDataLayout(), builder.getKindMap());
if (sourceSizeAndAlign && resultSizeAndAlign &&
sourceSizeAndAlign->first == resultSizeAndAlign->first) {
+ if (mlir::isa<mlir::IntegerType, mlir::FloatType>(sourceType) &&
+ mlir::isa<mlir::IntegerType, mlir::FloatType>(resultType)) {
+ mlir::Value val = fir::LoadOp::create(builder, loc, sourceBase);
+ if (sourceType != resultType)
+ val =
+ mlir::arith::BitcastOp::create(builder, loc, resultType, val);
+ return val;
+ }
mlir::Type refTy = builder.getRefType(resultType);
mlir::Value cast = builder.createConvert(loc, refTy, sourceBase);
return fir::LoadOp::create(builder, loc, cast);
>From ba390d3535006198c572fa86d4fb90eb4fb865af Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Mon, 13 Apr 2026 12:02:35 -0700
Subject: [PATCH 4/4] format
---
flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index 7d744e49c0695..055d4567b6b5b 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -8700,8 +8700,7 @@ IntrinsicLibrary::genTransfer(mlir::Type resultType,
mlir::isa<mlir::IntegerType, mlir::FloatType>(resultType)) {
mlir::Value val = fir::LoadOp::create(builder, loc, sourceBase);
if (sourceType != resultType)
- val =
- mlir::arith::BitcastOp::create(builder, loc, resultType, val);
+ val = mlir::arith::BitcastOp::create(builder, loc, resultType, val);
return val;
}
mlir::Type refTy = builder.getRefType(resultType);
More information about the flang-commits
mailing list