[llvm-branch-commits] [flang] 61999d7 - Revert "[flang][cuda] Specialize entry point for scalar to desc data transfer…"

Fri Nov 15 17:44:38 PST 2024

Author: Valentin Clement (バレンタイン クレメン)
Date: 2024-11-15T17:44:34-08:00
New Revision: 61999d7f2a3f86a0633fc3033805447f1508e6da

URL: https://github.com/llvm/llvm-project/commit/61999d7f2a3f86a0633fc3033805447f1508e6da
DIFF: https://github.com/llvm/llvm-project/commit/61999d7f2a3f86a0633fc3033805447f1508e6da.diff

LOG: Revert "[flang][cuda] Specialize entry point for scalar to desc data transfer…"

This reverts commit 43cb424a54c9452b60d96ef07d0699fc3b1ceb87.

Added: 
    

Modified: 
    flang/include/flang/Runtime/CUDA/memory.h
    flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
    flang/runtime/CUDA/memory.cpp
    flang/runtime/assign-impl.h
    flang/runtime/assign.cpp
    flang/test/Fir/CUDA/cuda-data-transfer.fir

Removed: 
    


################################################################################
diff  --git a/flang/include/flang/Runtime/CUDA/memory.h b/flang/include/flang/Runtime/CUDA/memory.h
index 2bb083b0dd75cb..713bdf536aaf90 100644

--- a/flang/include/flang/Runtime/CUDA/memory.h
+++ b/flang/include/flang/Runtime/CUDA/memory.h
@@ -44,10 +44,6 @@ void RTDECL(CUFDataTransferPtrDesc)(void *dst, Descriptor *src,
 void RTDECL(CUFDataTransferDescDesc)(Descriptor *dst, Descriptor *src,
     unsigned mode, const char *sourceFile = nullptr, int sourceLine = 0);
 
-/// Data transfer from a scalar descriptor to a descriptor.
-void RTDECL(CUFDataTransferCstDesc)(Descriptor *dst, Descriptor *src,
-    unsigned mode, const char *sourceFile = nullptr, int sourceLine = 0);
-
 /// Data transfer from a descriptor to a descriptor.
 void RTDECL(CUFDataTransferDescDescNoRealloc)(Descriptor *dst, Descriptor *src,
     unsigned mode, const char *sourceFile = nullptr, int sourceLine = 0);

diff  --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
index 9de20f0f0d45e1..ec7f67dff763b4 100644
--- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@@ -563,9 +563,8 @@ struct CUFDataTransferOpConversion
         // until we have more infrastructure.
         mlir::Value src = emboxSrc(rewriter, op, symtab);
         mlir::Value dst = emboxDst(rewriter, op, symtab);
-        mlir::func::FuncOp func =
-            fir::runtime::getRuntimeFunc<mkRTKey(CUFDataTransferCstDesc)>(
-                loc, builder);
+        mlir::func::FuncOp func = fir::runtime::getRuntimeFunc<mkRTKey(
+            CUFDataTransferDescDescNoRealloc)>(loc, builder);
         auto fTy = func.getFunctionType();
         mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc);
         mlir::Value sourceLine =
@@ -649,9 +648,6 @@ struct CUFDataTransferOpConversion
       mlir::Value src = op.getSrc();
       if (!mlir::isa<fir::BaseBoxType>(srcTy)) {
         src = emboxSrc(rewriter, op, symtab);
-        if (fir::isa_trivial(srcTy))
-          func = fir::runtime::getRuntimeFunc<mkRTKey(CUFDataTransferCstDesc)>(
-              loc, builder);
       }
       auto materializeBoxIfNeeded = [&](mlir::Value val) -> mlir::Value {
         if (mlir::isa<fir::EmboxOp>(val.getDefiningOp())) {

diff  --git a/flang/runtime/CUDA/memory.cpp b/flang/runtime/CUDA/memory.cpp
index 68963c4d7738ac..7b40b837e7666e 100644
--- a/flang/runtime/CUDA/memory.cpp
+++ b/flang/runtime/CUDA/memory.cpp
@@ -7,7 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Runtime/CUDA/memory.h"
-#include "../assign-impl.h"
 #include "../terminator.h"
 #include "flang/Runtime/CUDA/common.h"
 #include "flang/Runtime/CUDA/descriptor.h"
@@ -121,24 +120,6 @@ void RTDECL(CUFDataTransferDescDesc)(Descriptor *dstDesc, Descriptor *srcDesc,
       *dstDesc, *srcDesc, terminator, MaybeReallocate, memmoveFct);
 }
 
-void RTDECL(CUFDataTransferCstDesc)(Descriptor *dstDesc, Descriptor *srcDesc,
-    unsigned mode, const char *sourceFile, int sourceLine) {
-  MemmoveFct memmoveFct;
-  Terminator terminator{sourceFile, sourceLine};
-  if (mode == kHostToDevice) {
-    memmoveFct = &MemmoveHostToDevice;
-  } else if (mode == kDeviceToHost) {
-    memmoveFct = &MemmoveDeviceToHost;
-  } else if (mode == kDeviceToDevice) {
-    memmoveFct = &MemmoveDeviceToDevice;
-  } else {
-    terminator.Crash("host to host copy not supported");
-  }
-
-  Fortran::runtime::DoFromSourceAssign(
-      *dstDesc, *srcDesc, terminator, memmoveFct);
-}
-
 void RTDECL(CUFDataTransferDescDescNoRealloc)(Descriptor *dstDesc,
     Descriptor *srcDesc, unsigned mode, const char *sourceFile,
     int sourceLine) {

diff  --git a/flang/runtime/assign-impl.h b/flang/runtime/assign-impl.h
index 5db0bc81510bff..f07a501d1d1263 100644
--- a/flang/runtime/assign-impl.h
+++ b/flang/runtime/assign-impl.h
@@ -9,29 +9,16 @@
 #ifndef FORTRAN_RUNTIME_ASSIGN_IMPL_H_
 #define FORTRAN_RUNTIME_ASSIGN_IMPL_H_
 
-#include "flang/Runtime/freestanding-tools.h"
-
 namespace Fortran::runtime {
 class Descriptor;
 class Terminator;
 
-using MemmoveFct = void *(*)(void *, const void *, std::size_t);
-
 // Assign one object to another via allocate statement from source specifier.
 // Note that if allocate object and source expression have the same rank, the
 // value of the allocate object becomes the value provided; otherwise the value
 // of each element of allocate object becomes the value provided (9.7.1.2(7)).
-#ifdef RT_DEVICE_COMPILATION
-static RT_API_ATTRS void *MemmoveWrapper(
-    void *dest, const void *src, std::size_t count) {
-  return Fortran::runtime::memmove(dest, src, count);
-}
-RT_API_ATTRS void DoFromSourceAssign(Descriptor &, const Descriptor &,
-    Terminator &, MemmoveFct memmoveFct = &MemmoveWrapper);
-#else
-RT_API_ATTRS void DoFromSourceAssign(Descriptor &, const Descriptor &,
-    Terminator &, MemmoveFct memmoveFct = &Fortran::runtime::memmove);
-#endif
+RT_API_ATTRS void DoFromSourceAssign(
+    Descriptor &, const Descriptor &, Terminator &);
 
 } // namespace Fortran::runtime
 #endif // FORTRAN_RUNTIME_ASSIGN_IMPL_H_

diff  --git a/flang/runtime/assign.cpp b/flang/runtime/assign.cpp
index 8f0efaa376c198..83c0b9c70ed0d1 100644
--- a/flang/runtime/assign.cpp
+++ b/flang/runtime/assign.cpp
@@ -509,8 +509,8 @@ RT_API_ATTRS void Assign(Descriptor &to, const Descriptor &from,
 
 RT_OFFLOAD_API_GROUP_BEGIN
 
-RT_API_ATTRS void DoFromSourceAssign(Descriptor &alloc,
-    const Descriptor &source, Terminator &terminator, MemmoveFct memmoveFct) {
+RT_API_ATTRS void DoFromSourceAssign(
+    Descriptor &alloc, const Descriptor &source, Terminator &terminator) {
   if (alloc.rank() > 0 && source.rank() == 0) {
     // The value of each element of allocate object becomes the value of source.
     DescriptorAddendum *allocAddendum{alloc.Addendum()};
@@ -523,17 +523,17 @@ RT_API_ATTRS void DoFromSourceAssign(Descriptor &alloc,
            alloc.IncrementSubscripts(allocAt)) {
         Descriptor allocElement{*Descriptor::Create(*allocDerived,
             reinterpret_cast<void *>(alloc.Element<char>(allocAt)), 0)};
-        Assign(allocElement, source, terminator, NoAssignFlags, memmoveFct);
+        Assign(allocElement, source, terminator, NoAssignFlags);
       }
     } else { // intrinsic type
       for (std::size_t n{alloc.Elements()}; n-- > 0;
            alloc.IncrementSubscripts(allocAt)) {
-        memmoveFct(alloc.Element<char>(allocAt), source.raw().base_addr,
-            alloc.ElementBytes());
+        Fortran::runtime::memmove(alloc.Element<char>(allocAt),
+            source.raw().base_addr, alloc.ElementBytes());
       }
     }
   } else {
-    Assign(alloc, source, terminator, NoAssignFlags, memmoveFct);
+    Assign(alloc, source, terminator, NoAssignFlags);
   }
 }
 

diff  --git a/flang/test/Fir/CUDA/cuda-data-transfer.fir b/flang/test/Fir/CUDA/cuda-data-transfer.fir
index 1ee44f3c6d97c9..3209197e118d19 100644
--- a/flang/test/Fir/CUDA/cuda-data-transfer.fir
+++ b/flang/test/Fir/CUDA/cuda-data-transfer.fir
@@ -38,7 +38,7 @@ func.func @_QPsub2() {
 // CHECK: fir.store %[[EMBOX]] to %[[TEMP_BOX]] : !fir.ref<!fir.box<i32>>
 // CHECK: %[[ADEV_BOX:.*]] = fir.convert %[[ADEV]]#0 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK: %[[TEMP_CONV:.*]] = fir.convert %[[TEMP_BOX]] : (!fir.ref<!fir.box<i32>>) -> !fir.ref<!fir.box<none>>
-// CHECK: fir.call @_FortranACUFDataTransferCstDesc(%[[ADEV_BOX]], %[[TEMP_CONV]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<!fir.box<none>>, i32, !fir.ref<i8>, i32) -> none
+// CHECK: fir.call @_FortranACUFDataTransferDescDesc(%[[ADEV_BOX]], %[[TEMP_CONV]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<!fir.box<none>>, i32, !fir.ref<i8>, i32) -> none
 
 func.func @_QPsub3() {
   %0 = cuf.alloc !fir.box<!fir.heap<!fir.array<?xi32>>> {bindc_name = "adev", data_attr = #cuf.cuda<device>, uniq_name = "_QFsub3Eadev"} -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
@@ -58,7 +58,7 @@ func.func @_QPsub3() {
 // CHECK: fir.store %[[EMBOX]] to %[[TEMP_BOX]] : !fir.ref<!fir.box<i32>>
 // CHECK: %[[ADEV_BOX:.*]] = fir.convert %[[ADEV]]#0 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK: %[[V_CONV:.*]] = fir.convert %[[TEMP_BOX]] : (!fir.ref<!fir.box<i32>>) -> !fir.ref<!fir.box<none>>
-// CHECK: fir.call @_FortranACUFDataTransferCstDesc(%[[ADEV_BOX]], %[[V_CONV]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<!fir.box<none>>, i32, !fir.ref<i8>, i32) -> none
+// CHECK: fir.call @_FortranACUFDataTransferDescDesc(%[[ADEV_BOX]], %[[V_CONV]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<!fir.box<none>>, i32, !fir.ref<i8>, i32) -> none
   
 func.func @_QPsub4() {
   %0 = cuf.alloc !fir.box<!fir.heap<!fir.array<?xi32>>> {bindc_name = "adev", data_attr = #cuf.cuda<device>, uniq_name = "_QFsub4Eadev"} -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
@@ -297,7 +297,7 @@ func.func @_QPscalar_to_array() {
 }
 
 // CHECK-LABEL: func.func @_QPscalar_to_array()
-// CHECK: _FortranACUFDataTransferCstDesc
+// CHECK: _FortranACUFDataTransferDescDescNoRealloc
 
 func.func @_QPtest_type() {
   %0 = cuf.alloc !fir.type<_QMbarTcmplx{id:i32,c:complex<f32>}> {bindc_name = "a", data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_typeEa"} -> !fir.ref<!fir.type<_QMbarTcmplx{id:i32,c:complex<f32>}>>
@@ -344,7 +344,7 @@ func.func @_QPshape_shift() {
 }
 
 // CHECK-LABEL: func.func @_QPshape_shift()
-// CHECK: fir.call @_FortranACUFDataTransferCstDesc
+// CHECK: fir.call @_FortranACUFDataTransferDescDescNoRealloc
 
 func.func @_QPshape_shift2() {
   %c11 = arith.constant 11 : index
@@ -383,7 +383,7 @@ func.func @_QPdevice_addr_conv() {
 // CHECK: %[[DEV_ADDR:.*]] = fir.call @_FortranACUFGetDeviceAddress(%{{.*}}, %{{.*}}, %{{.*}}) : (!fir.llvm_ptr<i8>, !fir.ref<i8>, i32) -> !fir.llvm_ptr<i8>
 // CHECK: %[[DEV_ADDR_CONV:.*]] = fir.convert %[[DEV_ADDR]] : (!fir.llvm_ptr<i8>) -> !fir.ref<!fir.array<4xf32>>
 // CHECK: fir.embox %[[DEV_ADDR_CONV]](%{{.*}}) : (!fir.ref<!fir.array<4xf32>>, !fir.shape<1>) -> !fir.box<!fir.array<4xf32>>
-// CHECK: fir.call @_FortranACUFDataTransferCstDesc
+// CHECK: fir.call @_FortranACUFDataTransferDescDescNoRealloc
 
 func.func @_QQchar_transfer() attributes {fir.bindc_name = "char_transfer"} {
   %c1 = arith.constant 1 : index
@@ -464,6 +464,6 @@ func.func @_QPlogical_cst() {
 // CHECK: %[[EMBOX:.*]] = fir.embox %[[CONST]] : (!fir.ref<!fir.logical<4>>) -> !fir.box<!fir.logical<4>>
 // CHECK: fir.store %[[EMBOX]] to %[[DESC]] : !fir.ref<!fir.box<!fir.logical<4>>>
 // CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DESC]] : (!fir.ref<!fir.box<!fir.logical<4>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: fir.call @_FortranACUFDataTransferCstDesc(%{{.*}}, %[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<!fir.box<none>>, i32, !fir.ref<i8>, i32) -> none
+// CHECK: fir.call @_FortranACUFDataTransferDescDesc(%{{.*}}, %[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<!fir.box<none>>, i32, !fir.ref<i8>, i32) -> none
 
 } // end of module