[flang-commits] [flang] [llvm] [flang] Inline scalar-to-scalar TRANSFER for same-size trivial types (PR #191589)

Zhen Wang via flang-commits flang-commits at lists.llvm.org
Thu Apr 16 11:43:46 PDT 2026


https://github.com/wangzpgi updated https://github.com/llvm/llvm-project/pull/191589

>From c05e74be71f9779d51868842272f20c793282057 Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Fri, 10 Apr 2026 19:46:00 -0700
Subject: [PATCH 1/6] Inline scalar-to-scalar TRANSFER for same-size trivial
 types

---
 flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 27 ++++++++-
 flang/test/Lower/Intrinsics/transfer.f90      | 57 ++++++++++++++++---
 2 files changed, 74 insertions(+), 10 deletions(-)

diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index d6dee88f422e0..6b040fd342ad1 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -8680,6 +8680,31 @@ IntrinsicLibrary::genTransfer(mlir::Type resultType,
 
   assert(args.size() >= 2); // args.size() == 2 when size argument is omitted.
 
+  bool absentSize = (args.size() == 2);
+
+  // Inline scalar-to-scalar transfers when the result is a trivial type
+  // (integer, real, etc.) and both source and result have the same storage
+  // size.
+  if (absentSize && fir::isa_trivial(resultType)) {
+    mlir::Value sourceBase = fir::getBase(args[0]);
+    mlir::Type sourceType = fir::unwrapRefType(sourceBase.getType());
+    if (fir::isa_ref_type(sourceBase.getType()) &&
+        !mlir::isa<fir::SequenceType>(sourceType)) {
+      auto &dl = builder.getDataLayout();
+      auto &kindMap = builder.getKindMap();
+      auto sourceSizeAndAlign =
+          fir::getTypeSizeAndAlignment(loc, sourceType, dl, kindMap);
+      auto resultSizeAndAlign =
+          fir::getTypeSizeAndAlignment(loc, resultType, dl, kindMap);
+      if (sourceSizeAndAlign && resultSizeAndAlign &&
+          sourceSizeAndAlign->first == resultSizeAndAlign->first) {
+        auto refTy = builder.getRefType(resultType);
+        auto cast = builder.createConvert(loc, refTy, sourceBase);
+        return fir::LoadOp::create(builder, loc, cast);
+      }
+    }
+  }
+
   // Handle source argument
   mlir::Value source = builder.createBox(loc, args[0]);
 
@@ -8688,8 +8713,6 @@ IntrinsicLibrary::genTransfer(mlir::Type resultType,
   fir::BoxValue moldTmp = mold;
   unsigned moldRank = moldTmp.rank();
 
-  bool absentSize = (args.size() == 2);
-
   // Create mutable fir.box to be passed to the runtime for the result.
   mlir::Type type = (moldRank == 0 && absentSize)
                         ? resultType
diff --git a/flang/test/Lower/Intrinsics/transfer.f90 b/flang/test/Lower/Intrinsics/transfer.f90
index 6a9ea14570fb3..e2c7b51143a17 100644
--- a/flang/test/Lower/Intrinsics/transfer.f90
+++ b/flang/test/Lower/Intrinsics/transfer.f90
@@ -3,17 +3,12 @@
 subroutine trans_test(store, word)
     ! CHECK-LABEL: func @_QPtrans_test(
     ! CHECK-SAME:                      %[[VAL_0:.*]]: !fir.ref<i32>{{.*}}, %[[VAL_1:.*]]: !fir.ref<f32>{{.*}}) {
-    ! CHECK-DAG:     %[[RESULT_BOX:.*]] = fir.alloca !fir.box<!fir.heap<i32>>
     ! CHECK-DAG:     %[[store:.*]]:2 = hlfir.declare %[[VAL_0]] {{.*}}{uniq_name = "_QFtrans_testEstore"}
     ! CHECK-DAG:     %[[word:.*]]:2 = hlfir.declare %[[VAL_1]] {{.*}}{uniq_name = "_QFtrans_testEword"}
-    ! CHECK:         %[[VAL_3:.*]] = fir.embox %[[word]]#0 : (!fir.ref<f32>) -> !fir.box<f32>
-    ! CHECK:         %[[VAL_4:.*]] = fir.embox %[[store]]#0 : (!fir.ref<i32>) -> !fir.box<i32>
-    ! CHECK:         fir.call @_FortranATransfer({{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> ()
-    ! CHECK:         %[[LOADED:.*]] = fir.load %[[RESULT_BOX]] : !fir.ref<!fir.box<!fir.heap<i32>>>
-    ! CHECK:         %[[ADDR:.*]] = fir.box_addr %[[LOADED]] : (!fir.box<!fir.heap<i32>>) -> !fir.heap<i32>
-    ! CHECK:         %[[VAL:.*]] = fir.load %[[ADDR]] : !fir.heap<i32>
-    ! CHECK:         fir.freemem %[[ADDR]]
+    ! CHECK:         %[[LOADED:.*]] = fir.load %[[word]]#0 : !fir.ref<f32>
+    ! CHECK:         %[[VAL:.*]] = arith.bitcast %[[LOADED]] : f32 to i32
     ! CHECK:         hlfir.assign %[[VAL]] to %[[store]]#0 : i32, !fir.ref<i32>
+    ! CHECK-NOT:     fir.call @_FortranATransfer
     ! CHECK:         return
     ! CHECK:       }
     integer :: store
@@ -54,3 +49,49 @@ integer function trans_test3(p)
     t = transfer(p, t)
     trans_test3 = t%x
   end function
+
+  ! Same-size scalar transfer (real(8) -> integer(8), i.e. f64 -> i64) must be inlined as fir.load + arith.bitcast.
+  subroutine trans_test_r8_to_i8(store, word)
+    ! CHECK-LABEL: func @_QPtrans_test_r8_to_i8(
+    ! CHECK-SAME:    %[[RES:.*]]: !fir.ref<i64>{{.*}}, %[[SRC:.*]]: !fir.ref<f64>{{.*}}) {
+    ! CHECK-DAG:     %[[store:.*]]:2 = hlfir.declare %[[RES]] {{.*}}{uniq_name = "_QFtrans_test_r8_to_i8Estore"}
+    ! CHECK-DAG:     %[[word:.*]]:2 = hlfir.declare %[[SRC]] {{.*}}{uniq_name = "_QFtrans_test_r8_to_i8Eword"}
+    ! CHECK:         %[[LOADED:.*]] = fir.load %[[word]]#0 : !fir.ref<f64>
+    ! CHECK:         %[[VAL:.*]] = arith.bitcast %[[LOADED]] : f64 to i64
+    ! CHECK:         hlfir.assign %[[VAL]] to %[[store]]#0 : i64, !fir.ref<i64>
+    ! CHECK-NOT:     fir.call @_FortranATransfer
+    ! CHECK:         return
+    ! CHECK:       }
+    integer(8) :: store            ! result and MOLD (i64 in the expected IR)
+    real(8) :: word                ! SOURCE (f64 in the expected IR)
+    store = transfer(word, store)  ! no SIZE argument; the runtime Transfer call must not appear
+  end subroutine
+
+  ! BIND(C) struct (c_ptr) to integer(8): same byte size, inlined by converting the
+  ! source reference to !fir.ref<i64> and loading. Covers the c_devptr pattern on CUDA device code.
+  subroutine trans_test_cptr_to_i8(store, src)
+    ! CHECK-LABEL: func @_QPtrans_test_cptr_to_i8(
+    ! CHECK:         %[[srcDecl:.*]]:2 = hlfir.declare {{.*}}{uniq_name = "_QFtrans_test_cptr_to_i8Esrc"}
+    ! CHECK:         %[[storeDecl:.*]]:2 = hlfir.declare {{.*}}{uniq_name = "_QFtrans_test_cptr_to_i8Estore"}
+    ! CHECK:         %[[CAST:.*]] = fir.convert %[[srcDecl]]#0 : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) -> !fir.ref<i64>
+    ! CHECK:         %[[VAL:.*]] = fir.load %[[CAST]] : !fir.ref<i64>
+    ! CHECK:         hlfir.assign %[[VAL]] to %[[storeDecl]]#0 : i64, !fir.ref<i64>
+    ! CHECK-NOT:     fir.call @_FortranATransfer
+    ! CHECK:         return
+    ! CHECK:       }
+    use iso_c_binding
+    integer(8) :: store           ! result and MOLD (i64 in the expected IR)
+    type(c_ptr) :: src            ! SOURCE: derived type with a single i64 __address component
+    store = transfer(src, store)  ! no SIZE argument; the runtime Transfer call must not appear
+  end subroutine
+
+  ! Different-size scalar transfer (integer(4) -> integer(8), i.e. i32 -> i64) falls back to the runtime.
+  subroutine trans_test_diff_size(store, src)
+    ! CHECK-LABEL: func @_QPtrans_test_diff_size(
+    ! CHECK:         fir.call @_FortranATransfer(
+    ! CHECK:         return
+    ! CHECK:       }
+    integer(8) :: store           ! result and MOLD: integer(8)
+    integer(4) :: src             ! SOURCE: integer(4), a different size from MOLD
+    store = transfer(src, store)  ! sizes differ, so the runtime Transfer call is expected
+  end subroutine

>From d2747e1a20f91d8d9a7c94df0c1e17b911e08b16 Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Fri, 10 Apr 2026 19:56:07 -0700
Subject: [PATCH 2/6] code reorg

---
 flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index 6b040fd342ad1..fd67ef93e752f 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -8690,16 +8690,14 @@ IntrinsicLibrary::genTransfer(mlir::Type resultType,
     mlir::Type sourceType = fir::unwrapRefType(sourceBase.getType());
     if (fir::isa_ref_type(sourceBase.getType()) &&
         !mlir::isa<fir::SequenceType>(sourceType)) {
-      auto &dl = builder.getDataLayout();
-      auto &kindMap = builder.getKindMap();
-      auto sourceSizeAndAlign =
-          fir::getTypeSizeAndAlignment(loc, sourceType, dl, kindMap);
-      auto resultSizeAndAlign =
-          fir::getTypeSizeAndAlignment(loc, resultType, dl, kindMap);
+      auto sourceSizeAndAlign = fir::getTypeSizeAndAlignment(
+          loc, sourceType, builder.getDataLayout(), builder.getKindMap());
+      auto resultSizeAndAlign = fir::getTypeSizeAndAlignment(
+          loc, resultType, builder.getDataLayout(), builder.getKindMap());
       if (sourceSizeAndAlign && resultSizeAndAlign &&
           sourceSizeAndAlign->first == resultSizeAndAlign->first) {
-        auto refTy = builder.getRefType(resultType);
-        auto cast = builder.createConvert(loc, refTy, sourceBase);
+        mlir::Type refTy = builder.getRefType(resultType);
+        mlir::Value cast = builder.createConvert(loc, refTy, sourceBase);
         return fir::LoadOp::create(builder, loc, cast);
       }
     }

>From b2e49372e1705da5bbeddf8be1b56fd1cf6233d0 Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Mon, 13 Apr 2026 11:28:33 -0700
Subject: [PATCH 3/6] Use arith.bitcast for trivial-to-trivial transfer instead
 of address-level type punning

For scalar transfers where both source and result are integer or float types, emit fir.load + arith.bitcast instead of fir.convert on the ref type + fir.load. This produces cleaner value-level IR that directly expresses bit reinterpretation semantics. The address-level approach is retained for non-trivial source types (e.g., c_ptr to integer).
---
 flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index fd67ef93e752f..7d744e49c0695 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -8696,6 +8696,14 @@ IntrinsicLibrary::genTransfer(mlir::Type resultType,
           loc, resultType, builder.getDataLayout(), builder.getKindMap());
       if (sourceSizeAndAlign && resultSizeAndAlign &&
           sourceSizeAndAlign->first == resultSizeAndAlign->first) {
+        if (mlir::isa<mlir::IntegerType, mlir::FloatType>(sourceType) &&
+            mlir::isa<mlir::IntegerType, mlir::FloatType>(resultType)) {
+          mlir::Value val = fir::LoadOp::create(builder, loc, sourceBase);
+          if (sourceType != resultType)
+            val =
+                mlir::arith::BitcastOp::create(builder, loc, resultType, val);
+          return val;
+        }
         mlir::Type refTy = builder.getRefType(resultType);
         mlir::Value cast = builder.createConvert(loc, refTy, sourceBase);
         return fir::LoadOp::create(builder, loc, cast);

>From ba390d3535006198c572fa86d4fb90eb4fb865af Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Mon, 13 Apr 2026 12:02:35 -0700
Subject: [PATCH 4/6] format

---
 flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index 7d744e49c0695..055d4567b6b5b 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -8700,8 +8700,7 @@ IntrinsicLibrary::genTransfer(mlir::Type resultType,
             mlir::isa<mlir::IntegerType, mlir::FloatType>(resultType)) {
           mlir::Value val = fir::LoadOp::create(builder, loc, sourceBase);
           if (sourceType != resultType)
-            val =
-                mlir::arith::BitcastOp::create(builder, loc, resultType, val);
+            val = mlir::arith::BitcastOp::create(builder, loc, resultType, val);
           return val;
         }
         mlir::Type refTy = builder.getRefType(resultType);

>From 850130ee3c578a04973f08ee6ccba20730c65ab9 Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Mon, 13 Apr 2026 19:34:05 -0700
Subject: [PATCH 5/6] Fix inline transfer to skip array mold; add test

---
 flang/lib/Optimizer/Builder/IntrinsicCall.cpp |  4 +++-
 flang/test/Lower/Intrinsics/transfer.f90      | 12 ++++++++++++
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index 055d4567b6b5b..8c76d5fa98f71 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -8688,8 +8688,10 @@ IntrinsicLibrary::genTransfer(mlir::Type resultType,
   if (absentSize && fir::isa_trivial(resultType)) {
     mlir::Value sourceBase = fir::getBase(args[0]);
     mlir::Type sourceType = fir::unwrapRefType(sourceBase.getType());
+    mlir::Type moldType = fir::unwrapRefType(fir::getBase(args[1]).getType());
     if (fir::isa_ref_type(sourceBase.getType()) &&
-        !mlir::isa<fir::SequenceType>(sourceType)) {
+        !mlir::isa<fir::SequenceType>(sourceType) &&
+        !mlir::isa<fir::SequenceType>(moldType)) {
       auto sourceSizeAndAlign = fir::getTypeSizeAndAlignment(
           loc, sourceType, builder.getDataLayout(), builder.getKindMap());
       auto resultSizeAndAlign = fir::getTypeSizeAndAlignment(
diff --git a/flang/test/Lower/Intrinsics/transfer.f90 b/flang/test/Lower/Intrinsics/transfer.f90
index e2c7b51143a17..f77d9c6846612 100644
--- a/flang/test/Lower/Intrinsics/transfer.f90
+++ b/flang/test/Lower/Intrinsics/transfer.f90
@@ -95,3 +95,15 @@ subroutine trans_test_diff_size(store, src)
     integer(4) :: src
     store = transfer(src, store)
   end subroutine
+
+  ! Array mold without SIZE: the result is a rank-1 array, so the runtime must be used.
+  subroutine trans_test_array_mold(src, result)
+    ! CHECK-LABEL: func @_QPtrans_test_array_mold(
+    ! CHECK:         fir.call @_FortranATransfer(
+    ! CHECK:         return
+    ! CHECK:       }
+    real :: src                         ! scalar SOURCE
+    integer, allocatable :: result(:)   ! receives the rank-1 result
+    integer :: mold(4)                  ! local explicit-shape array used as MOLD
+    result = transfer(src, mold)        ! array MOLD, no SIZE: the runtime Transfer call is expected
+  end subroutine

>From be7538c9e17df951f7fb6c17c5708e906140105a Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Thu, 16 Apr 2026 11:42:59 -0700
Subject: [PATCH 6/6] add test, address comment

---
 flang/lib/Optimizer/Builder/IntrinsicCall.cpp |    4 +-
 flang/test/Lower/CUDA/cuda-gpu-unified.cuf    |  153 ++
 flang/test/Lower/Intrinsics/transfer.f90      |   44 +
 repro.o                                       |  Bin 0 -> 7544 bytes
 repro_mod.mod                                 | 1855 +++++++++++++++++
 5 files changed, 2054 insertions(+), 2 deletions(-)
 create mode 100644 flang/test/Lower/CUDA/cuda-gpu-unified.cuf
 create mode 100644 repro.o
 create mode 100644 repro_mod.mod

diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index 8c76d5fa98f71..d0a2bd2eaef21 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -8690,8 +8690,8 @@ IntrinsicLibrary::genTransfer(mlir::Type resultType,
     mlir::Type sourceType = fir::unwrapRefType(sourceBase.getType());
     mlir::Type moldType = fir::unwrapRefType(fir::getBase(args[1]).getType());
     if (fir::isa_ref_type(sourceBase.getType()) &&
-        !mlir::isa<fir::SequenceType>(sourceType) &&
-        !mlir::isa<fir::SequenceType>(moldType)) {
+        (fir::isa_trivial(sourceType) || mlir::isa<fir::RecordType>(sourceType)) &&
+        fir::isa_trivial(moldType)) {
       auto sourceSizeAndAlign = fir::getTypeSizeAndAlignment(
           loc, sourceType, builder.getDataLayout(), builder.getKindMap());
       auto resultSizeAndAlign = fir::getTypeSizeAndAlignment(
diff --git a/flang/test/Lower/CUDA/cuda-gpu-unified.cuf b/flang/test/Lower/CUDA/cuda-gpu-unified.cuf
new file mode 100644
index 0000000000000..f8d9638c96154
--- /dev/null
+++ b/flang/test/Lower/CUDA/cuda-gpu-unified.cuf
@@ -0,0 +1,153 @@
+! RUN: bbc -emit-hlfir -fcuda -gpu=unified %s -o - | FileCheck %s
+
+! Test -gpu=unified flag: allocatable arrays without explicit CUDA attributes
+! should be implicitly treated as unified.
+
+! -----------------------------------------------------------------------------
+! Test 1: Basic allocatable without explicit attribute becomes unified
+! -----------------------------------------------------------------------------
+subroutine test_implicit_unified()
+  real, allocatable :: a(:)  ! no explicit CUDA data attribute; the test expects #cuf.cuda<unified>
+  allocate(a(100))           ! expected to lower to cuf.allocate with the unified attribute
+  a = 1.0
+  deallocate(a)              ! expected to lower to cuf.deallocate with the unified attribute
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_implicit_unified()
+! CHECK: %[[BOX:.*]] = cuf.alloc !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", data_attr = #cuf.cuda<unified>, uniq_name = "_QFtest_implicit_unifiedEa"}
+! CHECK: fir.embox {{.*}} {allocator_idx = 4 : i32}
+! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {data_attr = #cuf.cuda<unified>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_implicit_unifiedEa"}
+! CHECK: cuf.allocate %[[BOX_DECL]]#0 : {{.*}} {data_attr = #cuf.cuda<unified>}
+! CHECK: cuf.deallocate %[[BOX_DECL]]#0 : {{.*}} {data_attr = #cuf.cuda<unified>}
+! CHECK: cuf.free %[[BOX_DECL]]#0 : {{.*}} {data_attr = #cuf.cuda<unified>}
+
+! -----------------------------------------------------------------------------
+! Test 2: Explicit device attribute is preserved (not overridden to unified)
+! -----------------------------------------------------------------------------
+subroutine test_explicit_device()
+  real, allocatable, device :: d(:)  ! explicit device attribute; the test expects #cuf.cuda<device>
+  allocate(d(100))                   ! expected to lower to cuf.allocate with the device attribute
+  deallocate(d)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_explicit_device()
+! CHECK: %[[BOX:.*]] = cuf.alloc !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "d", data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_explicit_deviceEd"}
+! CHECK: fir.embox {{.*}} {allocator_idx = 2 : i32}
+! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_explicit_deviceEd"}
+! CHECK: cuf.allocate %[[BOX_DECL]]#0 : {{.*}} {data_attr = #cuf.cuda<device>}
+
+! -----------------------------------------------------------------------------
+! Test 3: Explicit pinned attribute is preserved (not overridden to unified)
+! -----------------------------------------------------------------------------
+subroutine test_explicit_pinned()
+  real, allocatable, pinned :: p(:)  ! explicit pinned attribute; the test expects #cuf.cuda<pinned>
+  allocate(p(100))
+  deallocate(p)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_explicit_pinned()
+! CHECK: %[[BOX:.*]] = cuf.alloc !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "p", data_attr = #cuf.cuda<pinned>, uniq_name = "_QFtest_explicit_pinnedEp"}
+! CHECK: fir.embox {{.*}} {allocator_idx = 1 : i32}
+! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {data_attr = #cuf.cuda<pinned>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_explicit_pinnedEp"}
+
+! -----------------------------------------------------------------------------
+! Test 4: Explicit managed attribute is preserved (not overridden to unified)
+! -----------------------------------------------------------------------------
+subroutine test_explicit_managed()
+  real, allocatable, managed :: m(:)  ! explicit managed attribute; the test expects #cuf.cuda<managed>
+  allocate(m(100))
+  deallocate(m)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_explicit_managed()
+! CHECK: %[[BOX:.*]] = cuf.alloc !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "m", data_attr = #cuf.cuda<managed>, uniq_name = "_QFtest_explicit_managedEm"}
+! CHECK: fir.embox {{.*}} {allocator_idx = 3 : i32}
+! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {data_attr = #cuf.cuda<managed>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_explicit_managedEm"}
+
+! -----------------------------------------------------------------------------
+! Test 5: Pointer variables are NOT affected by -gpu=unified
+! -----------------------------------------------------------------------------
+subroutine test_pointer_not_unified()
+  real, pointer :: ptr(:)  ! POINTER, not ALLOCATABLE: expected to be a plain fir.alloca without a unified attribute
+  allocate(ptr(100))       ! expected to call the _FortranAPointerAllocate runtime entry
+  ptr = 1.0
+  deallocate(ptr)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_pointer_not_unified()
+! CHECK: %[[BOX:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xf32>>> {bindc_name = "ptr", uniq_name = "_QFtest_pointer_not_unifiedEptr"}
+! CHECK-NOT: data_attr = #cuf.cuda<unified>
+! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_pointer_not_unifiedEptr"}
+! CHECK: fir.call @_FortranAPointerAllocate
+
+! -----------------------------------------------------------------------------
+! Test 6: Multiple allocatables - mix of implicit and explicit
+! -----------------------------------------------------------------------------
+subroutine test_mixed_allocatables()
+  real, allocatable :: a(:)           ! No explicit attribute: should become unified
+  real, allocatable, device :: d(:)   ! Explicit device: should stay device
+  real, allocatable, pinned :: p(:)   ! Explicit pinned: should stay pinned
+  real, allocatable, managed :: m(:)  ! Explicit managed: should stay managed
+
+  allocate(a(10), d(10), p(10), m(10))
+  deallocate(a, d, p, m)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_mixed_allocatables()
+! CHECK: cuf.alloc {{.*}} {bindc_name = "a", data_attr = #cuf.cuda<unified>, uniq_name = "_QFtest_mixed_allocatablesEa"}
+! CHECK: cuf.alloc {{.*}} {bindc_name = "d", data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_mixed_allocatablesEd"}
+! CHECK: cuf.alloc {{.*}} {bindc_name = "m", data_attr = #cuf.cuda<managed>, uniq_name = "_QFtest_mixed_allocatablesEm"}
+! CHECK: cuf.alloc {{.*}} {bindc_name = "p", data_attr = #cuf.cuda<pinned>, uniq_name = "_QFtest_mixed_allocatablesEp"}
+
+! -----------------------------------------------------------------------------
+! Test 7: Multi-dimensional allocatable array
+! -----------------------------------------------------------------------------
+subroutine test_multidim()
+  real, allocatable :: arr(:,:,:)  ! rank-3 allocatable, no explicit attribute; the test expects #cuf.cuda<unified>
+  allocate(arr(10,20,30))
+  arr = 0.0
+  deallocate(arr)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_multidim()
+! CHECK: cuf.alloc {{.*}} {bindc_name = "arr", data_attr = #cuf.cuda<unified>, uniq_name = "_QFtest_multidimEarr"}
+! CHECK: fir.embox {{.*}} {allocator_idx = 4 : i32}
+
+! -----------------------------------------------------------------------------
+! Test 8: Dummy arguments - allocatable dummy without explicit attribute
+! -----------------------------------------------------------------------------
+subroutine test_dummy_allocatable(arr)
+  real, allocatable, intent(inout) :: arr(:)  ! allocatable dummy, no explicit attribute; the test expects #cuf.cuda<unified>
+  if (.not. allocated(arr)) allocate(arr(100))
+  arr = 1.0
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_dummy_allocatable(
+! CHECK-SAME: %{{.*}}: !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuf.data_attr = #cuf.cuda<unified>, fir.bindc_name = "arr"})
+! CHECK: hlfir.declare {{.*}} {data_attr = #cuf.cuda<unified>, fortran_attrs = #fir.var_attrs<allocatable, intent_inout>
+
+! -----------------------------------------------------------------------------
+! Test 9: Module variables - allocatable module variable becomes unified
+! -----------------------------------------------------------------------------
+module mod_globals
+  real, allocatable :: global_arr(:)             ! no explicit attribute; the fir.global is expected to be unified
+  real, allocatable, device :: global_device(:)  ! explicit device attribute; the fir.global is expected to stay device
+end module
+
+subroutine test_module_var()
+  use mod_globals
+  allocate(global_arr(50))     ! expected: cuf.allocate with the unified attribute and hasDoubleDescriptor
+  allocate(global_device(50))  ! expected: cuf.allocate with the device attribute and hasDoubleDescriptor
+  deallocate(global_arr)
+  deallocate(global_device)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_module_var()
+! CHECK: cuf.allocate {{.*}} {data_attr = #cuf.cuda<unified>, hasDoubleDescriptor}
+! CHECK: cuf.allocate {{.*}} {data_attr = #cuf.cuda<device>, hasDoubleDescriptor}
+
+! CHECK: fir.global @_QMmod_globalsEglobal_arr {data_attr = #cuf.cuda<unified>} : !fir.box<!fir.heap<!fir.array<?xf32>>>
+! CHECK: fir.embox {{.*}} {allocator_idx = 4 : i32}
+
+! CHECK: fir.global @_QMmod_globalsEglobal_device {data_attr = #cuf.cuda<device>} : !fir.box<!fir.heap<!fir.array<?xf32>>>
+! CHECK: fir.embox {{.*}} {allocator_idx = 2 : i32}
diff --git a/flang/test/Lower/Intrinsics/transfer.f90 b/flang/test/Lower/Intrinsics/transfer.f90
index f77d9c6846612..7afdfd28c2ae1 100644
--- a/flang/test/Lower/Intrinsics/transfer.f90
+++ b/flang/test/Lower/Intrinsics/transfer.f90
@@ -107,3 +107,47 @@ subroutine trans_test_array_mold(src, result)
     integer :: mold(4)
     result = transfer(src, mold)
   end subroutine
+
+  ! Allocatable (deferred-shape) array mold without SIZE: must use runtime.
+  subroutine trans_test_alloc_mold(src, result)
+    ! CHECK-LABEL: func @_QPtrans_test_alloc_mold(
+    ! CHECK:         fir.call @_FortranATransfer(
+    ! CHECK:         return
+    ! CHECK:       }
+    real :: src                         ! scalar SOURCE
+    integer, allocatable :: mold(:)     ! local allocatable rank-1 array used as MOLD
+    integer, allocatable :: result(:)   ! receives the rank-1 result
+    result = transfer(src, mold)        ! array MOLD, no SIZE: the runtime Transfer call is expected
+  end subroutine
+
+  ! POINTER source: the descriptor is unpacked before reaching genTransfer,
+  ! so the inline optimization (fir.load + arith.bitcast) still applies.
+  subroutine trans_test_pointer_source(store, src)
+    ! CHECK-LABEL: func @_QPtrans_test_pointer_source(
+    ! CHECK:         fir.load {{.*}} : !fir.ref<!fir.box<!fir.ptr<f32>>>
+    ! CHECK:         fir.box_addr
+    ! CHECK:         %[[VAL:.*]] = fir.load {{.*}} : !fir.ptr<f32>
+    ! CHECK:         arith.bitcast %[[VAL]] : f32 to i32
+    ! CHECK-NOT:     fir.call @_FortranATransfer
+    ! CHECK:         return
+    ! CHECK:       }
+    integer :: store              ! result and MOLD (i32 in the expected IR)
+    real, pointer :: src          ! scalar POINTER SOURCE; its address comes from fir.box_addr on the loaded descriptor
+    store = transfer(src, store)  ! no SIZE argument; the runtime Transfer call must not appear
+  end subroutine
+
+  ! ALLOCATABLE source: the descriptor is unpacked before reaching genTransfer,
+  ! so the inline optimization (fir.load + arith.bitcast) still applies.
+  subroutine trans_test_alloc_source(store, src)
+    ! CHECK-LABEL: func @_QPtrans_test_alloc_source(
+    ! CHECK:         fir.load {{.*}} : !fir.ref<!fir.box<!fir.heap<f32>>>
+    ! CHECK:         fir.box_addr
+    ! CHECK:         %[[VAL:.*]] = fir.load {{.*}} : !fir.heap<f32>
+    ! CHECK:         arith.bitcast %[[VAL]] : f32 to i32
+    ! CHECK-NOT:     fir.call @_FortranATransfer
+    ! CHECK:         return
+    ! CHECK:       }
+    integer :: store              ! result and MOLD (i32 in the expected IR)
+    real, allocatable :: src      ! scalar ALLOCATABLE SOURCE; its address comes from fir.box_addr on the loaded descriptor
+    store = transfer(src, store)  ! no SIZE argument; the runtime Transfer call must not appear
+  end subroutine
diff --git a/repro.o b/repro.o
new file mode 100644
index 0000000000000000000000000000000000000000..1e95a615d5a6650bdf4b1e52c98b9f458d7a505b
GIT binary patch
literal 7544
zcmdT}eQX at X6@PQLcfK5s&(1fc!5b2UlN{^a`|=&;;~YE2QH^5~+(sDpavy8^;C$!a
ztzA1I#DrFDMb*+4stTzBwN=F*Eo!BzqF*Ug5Yeb=Q~si#ibDPfEv*Ow6jij>_hx5&
z>-9MW^)E)ZJMaD8$Gn+&JF`1?a{ut4hcPdR!r*pbl5x~mu5B=BPNty^HAgQIGxX?f
zR2_Mp?kmdZKgNE_pdCiH*suo}OC$Mm#|kxQL0|VeAeVSqa?*}B+9D7Jr{UpcX at eMQ
zv*~+%_d=SJcD$g~rfXK!%I1)g_2P`p^Ey+TE*BO+E6;1i@??b*Q>98STSC5Qfo7Qt
zmX^8uuBtg(tz>2 at g;C3_AubG>iK=F4O*52b*Ay1a+}u=VvYMSSK_f5N$ckXNbrVo?
zTV_`xXp+yCN>kPB>@*J@`(jnB#aXj4hVPr&IbQ<2N*{c1Gnui0dol+H2M_PR2ap+b
zG5vib2S*O{_YIQ}HCOP1++;u+?jIQtT13%ERn3}Noi_`IXUrL`GC5f at Eq&E|I%jhK
zk at AuGYzbCk`6IRB%xuZb at Zni0D_=!cuvQ4yoMeoYGVMd3IL?u=pNY(1rD|2P<-XBM
zv22;uzFMu4FJ>+CFeYENdX)t0McEFAJRs<yHSn=Da7yQ9e&*J|kFSB#8q>_rqif(#
zxNrq|D$nBOh4q|v-cBQPv$`Ih^Vq0dkY5q_vM#f!e+v9(LjM|}5JWlok&F0qLjNYL
zz0kVsMiO7+xQtGs0B>+yuCfjMUmTaS(ZB;3#|DT0CDDP8aJ-=>sXCUhV2GyT1?9oV
zmzKI(AI2L5>JKk<1&9m~7<hs}Aievf6IU(URH!@LiLOwOePqfw>LXVk<@1R8iR7T4
z<P9V4Rp}1EMx?gkrR=n65-1%utrOj&@V&=7igZYUO at t*BkfTGfk`KbE{0Die|1IkG
zgdB&I>-eyKlNc-?!%Nx`e_r4zq{%0Uc|Rxcqzj)V9W10TwnEMy<#zzmcj5vD8akOv
zgk2K;`H)H_tkVwMna%eIoda0&I0Zi}Y6{zl4><64L0_$EUtsJUj$8eFy#+3Yz*AiZ
zG;R>Wi}L~A2hLvz!RKngHdCSn(EBBu--LO@`3nqA9arjb3i%*jxi&bpAZHJGv_G}5
z4hd0^k6WQS>|v}<{YQS6)mo>qsl%6Ah^O}WBnPp*3qF*u=PZ}MNcwH{gMWyjjzeEX
z2U7o`V4WgBd5HS6>!EAv6anXj|5j2U{YyUHUK<UZgAY>D3V&y-ehLo@&i0GlaJJgQ
zhl}$UL(17IPkdXB?VYXq8uC{{u(a3`;2y4RZ-r?e^8NsKNPNx*KRJdR#X}d`w?C*b
zJ|5>UhiE*woH{`AGbH!f<8|?5fX~;0$Z|e^o#0v6jEY!@|LsBOU*sI#p7+AvLY_w!
z6+82&f8WCSI__`G-wkazzBp8{bcY;Y?4RHEfc?MNN&ORx8uEV(!DkokbWx~3m-^t5
z#Z3&l(1P-%s}1?h3}pW7@&M(Zj$)pF7rJ3|Q9h>Uz3 at gS+`0H3&~sg{KjMYwL$GZ*
ze(J)9pSQ=7_7b~LE<+as<`HQoaIri{_2pXbN8Apdqh*1Mpp>RUdJOg4ic)TCt-eb+
z*?X>H at AkXeQ*%TW at LKUnd2{Q=8q3E4$8Y860leRFqQ{B~j{`S<cYmbZ)>?iou=QJ3
z>$~lxL)lNaA1Rl(yQW8pinMq8mvUQc_1!s-A2<&%ZDf1+R_}a?U)|6LM98mv2Arm8
z!R(X`3*N=@J+_(s<3jNkvA;{-ZhP88T`FAloN#(Y7|8ur<^gRDRG!7l?UV4A#p?ew
zF^=Na-EGTe9V(oh7OU+pA&`A^=ePLQpG~_Qd{Nkc)L~EWaBh1{U5bBI`_Bk_nOE-i
z<v8G~PUE-A{+zIvdAFARS42!{5vQ!q3;P8DkUhmO+m+-0JH)6 at X}p}}h}fNm1W5Mu
ze&x1*2eD at MPdBkAJ-7Wn#Ar at MxJZd26qiKDsiIt45@K0JkU-U0p(bHi5-*QuDDfOc
zwl5)EdDtj2SA>Iiu}&k;(CdnGAQeGJ`LU~iQltmFyS?z{^+9dktk#N^vYOP4pk`&O
zQ>LZXW-=)~sAUTUJQAp7YQ<xwnurArsE0K?%`vNGRhyhE=hgr5!5ZueYO`59jZw8+
zBBsImj?7-TBcq4opw?4HaCfjbsD0YZKA?(I8db}dic at 9PK!c>}{6H2*4dc-X^)~gm
zVI;$9swb=(dh)~oDCwlC&AkBY_QPei;`m`a8X18vPnl)2TFiH0`d;3&JFG_W7m3H>
z at mM0FCn7N;9#$iXNFt6Nb#&^7w}V*6VtDI`M3eDI46kS;7EkHX>jCz_ri2PVibs-0
zGGauHRH_ePJB0QCd#1-gAgRYv at jc+11FsiC3iE}Am^a{MI}PJ*21lVo$93azP>o0s
z+LJyojIeqFU(g_KM2zjwY0QByw8%WW3)V5VBOX7-+V%TbXbTHqGaWkm{0{TAj6wVE
z=OFZb2A#2|*v7ZLI4F at +B-Vq0Cyi7h7B!~XrsV7FTqKz?dg95RXcU9{GxL6xg`NW6
zy0RyOqY;UFJzno8VOw1B_(wb;%>x at _SD3HY6WYi8{v)2pa13I)kx0gW;n5S(#9x?t
zqIX0o6l=v?$?Q~GeBN<>fYGkBc7ozjzK%0{JgAi_`7_=P$ur)L%w~AYlPQ?96-ytF
z=@U5ZRded7(tG3K(GXla+zu~R3%44^?Y+U^Ol97Hy~E)#_*8E&KW}BVX<+Yo)2JHu
z!J*;aV5#t=q8x-F9JKM*z3X0r96H2g?FXY;hzvjqXK at R3MDZHl3Oj2So?c8qW&Fl!
zAv&=Ql*-=Z1kP>**uY*RG4TRxNhw=;-+}|xLM*(<S`Xr+G0%WcISo7F;aeb71CJ4j
zO<WJ1@$12JLo{**j+P2qE*j~A9f|N?q05-q3>$kk!)Nx!FqYbMDaSgJ<Ivu7+B?u2
ztW>u`M`|m0w!>(mH+Woq(%%yIv+puZ at 7)9)I7|MQ*oN^v;Q1AFMz(`z117;QQ`mhx
zbJXU_#b5i^N5X2?CtySP3ovTz?hT&!Hr!Myyv7b;rwqN7wMXB94r>Qv*D-hu-t-%K
zA|8!LBL=otH7y+16ZFaU!~WbLtz)aoWGP#oQsrV0iE6s0t6kHUHCwx-yL+l=P0!`D
zd}XG)RGPPMH{%iL{bt_MP^c#3x)Ie)Gn$Y0B(s?PJ<&-$X_|?AK9`N?JyAWMh-Y&t
zu at B0R0AR-s0b6Ki*R{3;9>c)s>qhQn{JXcWt^KYx_iEUP{_jXCaG8HIHc*@LHreqf
zQRH#p^4t1G#F=AtKPt}Of6j5Sh$1cbqmxL>JrBv{@-9*x at K!x7kRzg|F>j_XmwnoE
zXm7E<h3fnpmQ45HMXgJmKJ%!^Jxby>YVT3fCt-5rsQpGS08|oUAzi0Fo(RuQ6*F_$
zd_I$(D`YeHm=<fe;%6qaRwh?0XRC|A1qE}ym^bM>U&3_YZXeA9Wy*|M##4TA;69kO
z_7%&A%_)v%xNmXT*<UH+KMQktt5SvK{iJ_v at Q`S^uY~JJmP<3`c?_t8p<pC<GF`Du
z?aq<0n`>4!{{V)LVq2D}nNy-|b6Wl%;QIe!q*BJe3GNy`a0o3&KA!0x8|cdnAM6))
z6i-b{#PKQOBlZ8?{)Fc4y`Mau7YVh6Cdg4-EcBs_F!%dMTG+ at u7VN5AyA=hLGij+H
z>QT}Dkl^XviSj|_jm-CT*f_QC?t{#4sW03t+e at BKA)2-S4bgyXhuW99O|}yMCeqZt
z2UNT_%27ee=T%DF at r<Fsj$f53ynbFh=dv9sblcOrSF`wk&?J6(&ZNEcdnaPlzK)l(
zyeiti-w7g3x>RKSJqOJN;<j(AlWWk at P9ncY0<qZ@&pKj5!rmR1j9=RS28m|<f4NEj
d>6KR6%l^NHn7eKAy&&3ul^84(H|4n7{=aEJ0zUu%

literal 0
HcmV?d00001

diff --git a/repro_mod.mod b/repro_mod.mod
new file mode 100644
index 0000000000000..a98c702d7e01d
--- /dev/null
+++ b/repro_mod.mod
@@ -0,0 +1,1855 @@
+!mod$ v1 sum:ec95fad7fdca7ef6
+!need$ 1b6af6422890602e i cudafor
+!need$ 21f0f0adf5cb910f i __fortran_builtins
+!need$ c8dda17ea6314235 i __cuda_builtins
+!need$ 899a840e1af27140 i cudadevice
+module repro_mod
+use,intrinsic::cudafor,only:dim3
+use,intrinsic::cudafor,only:c_devptr
+use,intrinsic::cudafor,only:c_devloc
+use,intrinsic::cudafor,only:c_associated
+use,intrinsic::cudafor,only:c_funloc
+use,intrinsic::cudafor,only:c_funptr
+use,intrinsic::cudafor,only:c_f_pointer
+use,intrinsic::cudafor,only:c_f_strpointer
+use,intrinsic::cudafor,only:c_loc
+use,intrinsic::cudafor,only:c_null_funptr
+use,intrinsic::cudafor,only:c_null_ptr
+use,intrinsic::cudafor,only:c_ptr
+use,intrinsic::cudafor,only:c_sizeof
+use,intrinsic::cudafor,only:f_c_string
+use,intrinsic::cudafor,only:c_int8_t
+use,intrinsic::cudafor,only:c_int16_t
+use,intrinsic::cudafor,only:c_int32_t
+use,intrinsic::cudafor,only:c_int64_t
+use,intrinsic::cudafor,only:c_int128_t
+use,intrinsic::cudafor,only:c_int
+use,intrinsic::cudafor,only:c_short
+use,intrinsic::cudafor,only:c_long
+use,intrinsic::cudafor,only:c_long_long
+use,intrinsic::cudafor,only:c_signed_char
+use,intrinsic::cudafor,only:c_size_t
+use,intrinsic::cudafor,only:c_intmax_t
+use,intrinsic::cudafor,only:c_intptr_t
+use,intrinsic::cudafor,only:c_ptrdiff_t
+use,intrinsic::cudafor,only:c_int_least8_t
+use,intrinsic::cudafor,only:c_int_fast8_t
+use,intrinsic::cudafor,only:c_int_least16_t
+use,intrinsic::cudafor,only:c_int_fast16_t
+use,intrinsic::cudafor,only:c_int_least32_t
+use,intrinsic::cudafor,only:c_int_fast32_t
+use,intrinsic::cudafor,only:c_int_least64_t
+use,intrinsic::cudafor,only:c_int_fast64_t
+use,intrinsic::cudafor,only:c_int_least128_t
+use,intrinsic::cudafor,only:c_int_fast128_t
+use,intrinsic::cudafor,only:c_float
+use,intrinsic::cudafor,only:c_double
+use,intrinsic::cudafor,only:c_long_double
+use,intrinsic::cudafor,only:c_float_complex
+use,intrinsic::cudafor,only:c_double_complex
+use,intrinsic::cudafor,only:c_long_double_complex
+use,intrinsic::cudafor,only:c_bool
+use,intrinsic::cudafor,only:c_char
+use,intrinsic::cudafor,only:c_null_char
+use,intrinsic::cudafor,only:c_alert
+use,intrinsic::cudafor,only:c_backspace
+use,intrinsic::cudafor,only:c_form_feed
+use,intrinsic::cudafor,only:c_new_line
+use,intrinsic::cudafor,only:c_carriage_return
+use,intrinsic::cudafor,only:c_horizontal_tab
+use,intrinsic::cudafor,only:c_vertical_tab
+use,intrinsic::cudafor,only:c_float128
+use,intrinsic::cudafor,only:c_float128_complex
+use,intrinsic::cudafor,only:c_uint8_t
+use,intrinsic::cudafor,only:c_uint16_t
+use,intrinsic::cudafor,only:c_uint32_t
+use,intrinsic::cudafor,only:c_uint64_t
+use,intrinsic::cudafor,only:c_uint128_t
+use,intrinsic::cudafor,only:c_unsigned_char
+use,intrinsic::cudafor,only:c_unsigned_short
+use,intrinsic::cudafor,only:c_unsigned
+use,intrinsic::cudafor,only:c_unsigned_long
+use,intrinsic::cudafor,only:c_unsigned_long_long
+use,intrinsic::cudafor,only:c_uintmax_t
+use,intrinsic::cudafor,only:c_uint_fast8_t
+use,intrinsic::cudafor,only:c_uint_fast16_t
+use,intrinsic::cudafor,only:c_uint_fast32_t
+use,intrinsic::cudafor,only:c_uint_fast64_t
+use,intrinsic::cudafor,only:c_uint_fast128_t
+use,intrinsic::cudafor,only:c_uint_least8_t
+use,intrinsic::cudafor,only:c_uint_least16_t
+use,intrinsic::cudafor,only:c_uint_least32_t
+use,intrinsic::cudafor,only:c_uint_least64_t
+use,intrinsic::cudafor,only:c_uint_least128_t
+use,intrinsic::cudafor,only:c_f_procpointer
+use,intrinsic::cudafor,only:wmmahalf
+use,intrinsic::cudafor,only:c_null_devptr
+use,intrinsic::cudafor,only:cuda_r_16f
+use,intrinsic::cudafor,only:cuda_c_16f
+use,intrinsic::cudafor,only:cuda_r_16bf
+use,intrinsic::cudafor,only:cuda_c_16bf
+use,intrinsic::cudafor,only:cuda_r_32f
+use,intrinsic::cudafor,only:cuda_c_32f
+use,intrinsic::cudafor,only:cuda_r_64f
+use,intrinsic::cudafor,only:cuda_c_64f
+use,intrinsic::cudafor,only:cuda_r_4i
+use,intrinsic::cudafor,only:cuda_c_4i
+use,intrinsic::cudafor,only:cuda_r_4u
+use,intrinsic::cudafor,only:cuda_c_4u
+use,intrinsic::cudafor,only:cuda_r_8i
+use,intrinsic::cudafor,only:cuda_c_8i
+use,intrinsic::cudafor,only:cuda_r_8u
+use,intrinsic::cudafor,only:cuda_c_8u
+use,intrinsic::cudafor,only:cuda_r_16i
+use,intrinsic::cudafor,only:cuda_c_16i
+use,intrinsic::cudafor,only:cuda_r_16u
+use,intrinsic::cudafor,only:cuda_c_16u
+use,intrinsic::cudafor,only:cuda_r_32i
+use,intrinsic::cudafor,only:cuda_c_32i
+use,intrinsic::cudafor,only:cuda_r_32u
+use,intrinsic::cudafor,only:cuda_c_32u
+use,intrinsic::cudafor,only:cuda_r_64i
+use,intrinsic::cudafor,only:cuda_c_64i
+use,intrinsic::cudafor,only:cuda_r_64u
+use,intrinsic::cudafor,only:cuda_c_64u
+use,intrinsic::cudafor,only:major_version
+use,intrinsic::cudafor,only:minor_version
+use,intrinsic::cudafor,only:patch_level
+use,intrinsic::cudafor,only:cudadatatype
+use,intrinsic::cudafor,only:__pgf90_assign_int_to_dim3
+use,intrinsic::cudafor,only:compare_eq_cdevptrs
+use,intrinsic::cudafor,only:compare_ne_cdevptrs
+use,intrinsic::cudafor,only:compare_eq_cudadatatypes
+use,intrinsic::cudafor,only:compare_ne_cudadatatypes
+use,intrinsic::cudafor,only:cudasuccess
+use,intrinsic::cudafor,only:cudaerrorinvalidvalue
+use,intrinsic::cudafor,only:cudaerrormemoryallocation
+use,intrinsic::cudafor,only:cudaerrorinitializationerror
+use,intrinsic::cudafor,only:cudaerrorcudartunloading
+use,intrinsic::cudafor,only:cudaerrorprofilerdisabled
+use,intrinsic::cudafor,only:cudaerrorprofilernotinitialized
+use,intrinsic::cudafor,only:cudaerrorprofileralreadystarted
+use,intrinsic::cudafor,only:cudaerrorprofileralreadystopped
+use,intrinsic::cudafor,only:cudaerrorinvalidconfiguration
+use,intrinsic::cudafor,only:cudaerrorinvalidpitchvalue
+use,intrinsic::cudafor,only:cudaerrorinvalidsymbol
+use,intrinsic::cudafor,only:cudaerrorinvalidhostpointer
+use,intrinsic::cudafor,only:cudaerrorinvaliddevicepointer
+use,intrinsic::cudafor,only:cudaerrorinvalidtexture
+use,intrinsic::cudafor,only:cudaerrorinvalidtexturebinding
+use,intrinsic::cudafor,only:cudaerrorinvalidchanneldescriptor
+use,intrinsic::cudafor,only:cudaerrorinvalidmemcpydirection
+use,intrinsic::cudafor,only:cudaerroraddressofconstant
+use,intrinsic::cudafor,only:cudaerrortexturefetchfailed
+use,intrinsic::cudafor,only:cudaerrortexturenotbound
+use,intrinsic::cudafor,only:cudaerrorsynchronizationerror
+use,intrinsic::cudafor,only:cudaerrorinvalidfiltersetting
+use,intrinsic::cudafor,only:cudaerrorinvalidnormsetting
+use,intrinsic::cudafor,only:cudaerrormixeddeviceexecution
+use,intrinsic::cudafor,only:cudaerrornotyetimplemented
+use,intrinsic::cudafor,only:cudaerrormemoryvaluetoolarge
+use,intrinsic::cudafor,only:cudaerrorinsufficientdriver
+use,intrinsic::cudafor,only:cudaerrorinvalidsurface
+use,intrinsic::cudafor,only:cudaerrorduplicatevariablename
+use,intrinsic::cudafor,only:cudaerrorduplicatetexturename
+use,intrinsic::cudafor,only:cudaerrorduplicatesurfacename
+use,intrinsic::cudafor,only:cudaerrordevicesunavailable
+use,intrinsic::cudafor,only:cudaerrorincompatibledrivercontext
+use,intrinsic::cudafor,only:cudaerrormissingconfiguration
+use,intrinsic::cudafor,only:cudaerrorpriorlaunchfailure
+use,intrinsic::cudafor,only:cudaerrorlaunchmaxdepthexceeded
+use,intrinsic::cudafor,only:cudaerrorlaunchfilescopedtex
+use,intrinsic::cudafor,only:cudaerrorlaunchfilescopedsurf
+use,intrinsic::cudafor,only:cudaerrorsyncdepthexceeded
+use,intrinsic::cudafor,only:cudaerrorlaunchpendingcountexceeded
+use,intrinsic::cudafor,only:cudaerrorinvaliddevicefunction
+use,intrinsic::cudafor,only:cudaerrornodevice
+use,intrinsic::cudafor,only:cudaerrorinvaliddevice
+use,intrinsic::cudafor,only:cudaerrorstartupfailure
+use,intrinsic::cudafor,only:cudaerrorinvalidkernelimage
+use,intrinsic::cudafor,only:cudaerrordeviceuninitialized
+use,intrinsic::cudafor,only:cudaerrormapbufferobjectfailed
+use,intrinsic::cudafor,only:cudaerrorunmapbufferobjectfailed
+use,intrinsic::cudafor,only:cudaerrorarrayismapped
+use,intrinsic::cudafor,only:cudaerroralreadymapped
+use,intrinsic::cudafor,only:cudaerrornokernelimagefordevice
+use,intrinsic::cudafor,only:cudaerroralreadyacquired
+use,intrinsic::cudafor,only:cudaerrornotmapped
+use,intrinsic::cudafor,only:cudaerrornotmappedasarray
+use,intrinsic::cudafor,only:cudaerrornotmappedaspointer
+use,intrinsic::cudafor,only:cudaerroreccuncorrectable
+use,intrinsic::cudafor,only:cudaerrorunsupportedlimit
+use,intrinsic::cudafor,only:cudaerrordevicealreadyinuse
+use,intrinsic::cudafor,only:cudaerrorpeeraccessunsupported
+use,intrinsic::cudafor,only:cudaerrorinvalidptx
+use,intrinsic::cudafor,only:cudaerrorinvalidgraphicscontext
+use,intrinsic::cudafor,only:cudaerrornvlinkuncorrectable
+use,intrinsic::cudafor,only:cudaerrorjitcompilernotfound
+use,intrinsic::cudafor,only:cudaerrorinvalidsource
+use,intrinsic::cudafor,only:cudaerrorfilenotfound
+use,intrinsic::cudafor,only:cudaerrorsharedobjectsymbolnotfound
+use,intrinsic::cudafor,only:cudaerrorsharedobjectinitfailed
+use,intrinsic::cudafor,only:cudaerroroperatingsystem
+use,intrinsic::cudafor,only:cudaerrorinvalidresourcehandle
+use,intrinsic::cudafor,only:cudaerrorillegalstate
+use,intrinsic::cudafor,only:cudaerrorsymbolnotfound
+use,intrinsic::cudafor,only:cudaerrornotready
+use,intrinsic::cudafor,only:cudaerrorillegaladdress
+use,intrinsic::cudafor,only:cudaerrorlaunchoutofresources
+use,intrinsic::cudafor,only:cudaerrorlaunchtimeout
+use,intrinsic::cudafor,only:cudaerrorlaunchincompatibletexturing
+use,intrinsic::cudafor,only:cudaerrorpeeraccessalreadyenabled
+use,intrinsic::cudafor,only:cudaerrorpeeraccessnotenabled
+use,intrinsic::cudafor,only:cudaerrorsetonactiveprocess
+use,intrinsic::cudafor,only:cudaerrorcontextisdestroyed
+use,intrinsic::cudafor,only:cudaerrorassert
+use,intrinsic::cudafor,only:cudaerrortoomanypeers
+use,intrinsic::cudafor,only:cudaerrorhostmemoryalreadyregistered
+use,intrinsic::cudafor,only:cudaerrorhostmemorynotregistered
+use,intrinsic::cudafor,only:cudaerrorhardwarestackerror
+use,intrinsic::cudafor,only:cudaerrorillegalinstruction
+use,intrinsic::cudafor,only:cudaerrormisalignedaddress
+use,intrinsic::cudafor,only:cudaerrorinvalidaddressspace
+use,intrinsic::cudafor,only:cudaerrorinvalidpc
+use,intrinsic::cudafor,only:cudaerrorlaunchfailure
+use,intrinsic::cudafor,only:cudaerrorcooperativelaunchtoolarge
+use,intrinsic::cudafor,only:cudaerrornotpermitted
+use,intrinsic::cudafor,only:cudaerrornotsupported
+use,intrinsic::cudafor,only:cudaerrorsystemnotready
+use,intrinsic::cudafor,only:cudaerrorsystemdrivermismatch
+use,intrinsic::cudafor,only:cudaerrorcompatnotsupportedondevice
+use,intrinsic::cudafor,only:cudaerrorstreamcaptureunsupported
+use,intrinsic::cudafor,only:cudaerrorstreamcaptureinvalidated
+use,intrinsic::cudafor,only:cudaerrorstreamcapturemerge
+use,intrinsic::cudafor,only:cudaerrorstreamcaptureunmatched
+use,intrinsic::cudafor,only:cudaerrorstreamcaptureunjoined
+use,intrinsic::cudafor,only:cudaerrorstreamcaptureisolation
+use,intrinsic::cudafor,only:cudaerrorstreamcaptureimplicit
+use,intrinsic::cudafor,only:cudaerrorcapturedevent
+use,intrinsic::cudafor,only:cudaerrorstreamcapturewrongthread
+use,intrinsic::cudafor,only:cudaerrortimeout
+use,intrinsic::cudafor,only:cudaerrorgraphexecupdatefailure
+use,intrinsic::cudafor,only:cudaerrorunknown
+use,intrinsic::cudafor,only:cudaerrorapifailurebase
+use,intrinsic::cudafor,only:cudamemorytypeunregistered
+use,intrinsic::cudafor,only:cudamemorytypehost
+use,intrinsic::cudafor,only:cudamemorytypedevice
+use,intrinsic::cudafor,only:cudamemorytypemanaged
+use,intrinsic::cudafor,only:cudamemcpyhosttohost
+use,intrinsic::cudafor,only:cudamemcpyhosttodevice
+use,intrinsic::cudafor,only:cudamemcpydevicetohost
+use,intrinsic::cudafor,only:cudamemcpydevicetodevice
+use,intrinsic::cudafor,only:cudamemcpydefault
+use,intrinsic::cudafor,only:cudachannelformatkindsigned
+use,intrinsic::cudafor,only:cudachannelformatkindunsigned
+use,intrinsic::cudafor,only:cudachannelformatkindfloat
+use,intrinsic::cudafor,only:cudachannelformatkindnone
+use,intrinsic::cudafor,only:cudafunccacheprefernone
+use,intrinsic::cudafor,only:cudafunccacheprefershared
+use,intrinsic::cudafor,only:cudafunccachepreferl1
+use,intrinsic::cudafor,only:cudafunccachepreferequal
+use,intrinsic::cudafor,only:cudafuncattributemaxdynamicsharedmemorysize
+use,intrinsic::cudafor,only:cudafuncattributepreferredsharedmemorycarveout
+use,intrinsic::cudafor,only:cudafuncattributemax
+use,intrinsic::cudafor,only:cudasharedmemcarveoutdefault
+use,intrinsic::cudafor,only:cudasharedmemcarveoutmaxl1
+use,intrinsic::cudafor,only:cudasharedmemcarveoutmaxshared
+use,intrinsic::cudafor,only:cudalimitstacksize
+use,intrinsic::cudafor,only:cudalimitprintffifosize
+use,intrinsic::cudafor,only:cudalimitmallocheapsize
+use,intrinsic::cudafor,only:cudalimitdevruntimesyncdepth
+use,intrinsic::cudafor,only:cudalimitdevruntimependinglaunchcount
+use,intrinsic::cudafor,only:cudalimitmaxl2fetchgranularity
+use,intrinsic::cudafor,only:cudalimitpersistingl2cachesize
+use,intrinsic::cudafor,only:cudamemadvisesetreadmostly
+use,intrinsic::cudafor,only:cudamemadviseunsetreadmostly
+use,intrinsic::cudafor,only:cudamemadvisesetpreferredlocation
+use,intrinsic::cudafor,only:cudamemadviseunsetpreferredlocation
+use,intrinsic::cudafor,only:cudamemadvisesetaccessedby
+use,intrinsic::cudafor,only:cudamemadviseunsetaccessedby
+use,intrinsic::cudafor,only:cudamemlocationtypeinvalid
+use,intrinsic::cudafor,only:cudamemlocationtypenone
+use,intrinsic::cudafor,only:cudamemlocationtypedevice
+use,intrinsic::cudafor,only:cudamemlocationtypehost
+use,intrinsic::cudafor,only:cudamemlocationtypehostnuma
+use,intrinsic::cudafor,only:cudamemlocationtypehostnumacurrent
+use,intrinsic::cudafor,only:cudamemrangeattributereadmostly
+use,intrinsic::cudafor,only:cudamemrangeattributepreferredlocation
+use,intrinsic::cudafor,only:cudamemrangeattributeaccessedby
+use,intrinsic::cudafor,only:cudamemrangeattributelastprefetchlocation
+use,intrinsic::cudafor,only:cudaaccesspropertynormal
+use,intrinsic::cudafor,only:cudaaccesspropertystreaming
+use,intrinsic::cudafor,only:cudaaccesspropertypersisting
+use,intrinsic::cudafor,only:cudaaddressmodewrap
+use,intrinsic::cudafor,only:cudaaddressmodeclamp
+use,intrinsic::cudafor,only:cudaaddressmodemirror
+use,intrinsic::cudafor,only:cudaaddressmodeborder
+use,intrinsic::cudafor,only:cudafiltermodepoint
+use,intrinsic::cudafor,only:cudafiltermodelinear
+use,intrinsic::cudafor,only:cudasharedmembanksizedefault
+use,intrinsic::cudafor,only:cudasharedmembanksizefourbyte
+use,intrinsic::cudafor,only:cudasharedmembanksizeeightbyte
+use,intrinsic::cudafor,only:cudareadmodeelementtype
+use,intrinsic::cudafor,only:cudareadmodenormalizedfloat
+use,intrinsic::cudafor,only:cudacomputemodedefault
+use,intrinsic::cudafor,only:cudacomputemodeexclusive
+use,intrinsic::cudafor,only:cudacomputemodeprohibited
+use,intrinsic::cudafor,only:cudacomputemodeexclusiveprocess
+use,intrinsic::cudafor,only:cudadevattrmaxthreadsperblock
+use,intrinsic::cudafor,only:cudadevattrmaxblockdimx
+use,intrinsic::cudafor,only:cudadevattrmaxblockdimy
+use,intrinsic::cudafor,only:cudadevattrmaxblockdimz
+use,intrinsic::cudafor,only:cudadevattrmaxgriddimx
+use,intrinsic::cudafor,only:cudadevattrmaxgriddimy
+use,intrinsic::cudafor,only:cudadevattrmaxgriddimz
+use,intrinsic::cudafor,only:cudadevattrmaxsharedmemoryperblock
+use,intrinsic::cudafor,only:cudadevattrtotalconstantmemory
+use,intrinsic::cudafor,only:cudadevattrwarpsize
+use,intrinsic::cudafor,only:cudadevattrmaxpitch
+use,intrinsic::cudafor,only:cudadevattrmaxregistersperblock
+use,intrinsic::cudafor,only:cudadevattrclockrate
+use,intrinsic::cudafor,only:cudadevattrtexturealignment
+use,intrinsic::cudafor,only:cudadevattrgpuoverlap
+use,intrinsic::cudafor,only:cudadevattrmultiprocessorcount
+use,intrinsic::cudafor,only:cudadevattrkernelexectimeout
+use,intrinsic::cudafor,only:cudadevattrintegrated
+use,intrinsic::cudafor,only:cudadevattrcanmaphostmemory
+use,intrinsic::cudafor,only:cudadevattrcomputemode
+use,intrinsic::cudafor,only:cudadevattrmaxtexture1dwidth
+use,intrinsic::cudafor,only:cudadevattrmaxtexture2dwidth
+use,intrinsic::cudafor,only:cudadevattrmaxtexture2dheight
+use,intrinsic::cudafor,only:cudadevattrmaxtexture3dwidth
+use,intrinsic::cudafor,only:cudadevattrmaxtexture3dheight
+use,intrinsic::cudafor,only:cudadevattrmaxtexture3ddepth
+use,intrinsic::cudafor,only:cudadevattrmaxtexture2dlayeredwidth
+use,intrinsic::cudafor,only:cudadevattrmaxtexture2dlayeredheight
+use,intrinsic::cudafor,only:cudadevattrmaxtexture2dlayeredlayers
+use,intrinsic::cudafor,only:cudadevattrsurfacealignment
+use,intrinsic::cudafor,only:cudadevattrconcurrentkernels
+use,intrinsic::cudafor,only:cudadevattreccenabled
+use,intrinsic::cudafor,only:cudadevattrpcibusid
+use,intrinsic::cudafor,only:cudadevattrpcideviceid
+use,intrinsic::cudafor,only:cudadevattrtccdriver
+use,intrinsic::cudafor,only:cudadevattrmemoryclockrate
+use,intrinsic::cudafor,only:cudadevattrglobalmemorybuswidth
+use,intrinsic::cudafor,only:cudadevattrl2cachesize
+use,intrinsic::cudafor,only:cudadevattrmaxthreadspermultiprocessor
+use,intrinsic::cudafor,only:cudadevattrasyncenginecount
+use,intrinsic::cudafor,only:cudadevattrunifiedaddressing
+use,intrinsic::cudafor,only:cudadevattrmaxtexture1dlayeredwidth
+use,intrinsic::cudafor,only:cudadevattrmaxtexture1dlayeredlayers
+use,intrinsic::cudafor,only:cudadevattrmaxtexture2dgatherwidth
+use,intrinsic::cudafor,only:cudadevattrmaxtexture2dgatherheight
+use,intrinsic::cudafor,only:cudadevattrmaxtexture3dwidthalt
+use,intrinsic::cudafor,only:cudadevattrmaxtexture3dheightalt
+use,intrinsic::cudafor,only:cudadevattrmaxtexture3ddepthalt
+use,intrinsic::cudafor,only:cudadevattrpcidomainid
+use,intrinsic::cudafor,only:cudadevattrtexturepitchalignment
+use,intrinsic::cudafor,only:cudadevattrmaxtexturecubemapwidth
+use,intrinsic::cudafor,only:cudadevattrmaxtexturecubemaplayeredwidth
+use,intrinsic::cudafor,only:cudadevattrmaxtexturecubemaplayeredlayers
+use,intrinsic::cudafor,only:cudadevattrmaxsurface1dwidth
+use,intrinsic::cudafor,only:cudadevattrmaxsurface2dwidth
+use,intrinsic::cudafor,only:cudadevattrmaxsurface2dheight
+use,intrinsic::cudafor,only:cudadevattrmaxsurface3dwidth
+use,intrinsic::cudafor,only:cudadevattrmaxsurface3dheight
+use,intrinsic::cudafor,only:cudadevattrmaxsurface3ddepth
+use,intrinsic::cudafor,only:cudadevattrmaxsurface1dlayeredwidth
+use,intrinsic::cudafor,only:cudadevattrmaxsurface1dlayeredlayers
+use,intrinsic::cudafor,only:cudadevattrmaxsurface2dlayeredwidth
+use,intrinsic::cudafor,only:cudadevattrmaxsurface2dlayeredheight
+use,intrinsic::cudafor,only:cudadevattrmaxsurface2dlayeredlayers
+use,intrinsic::cudafor,only:cudadevattrmaxsurfacecubemapwidth
+use,intrinsic::cudafor,only:cudadevattrmaxsurfacecubemaplayeredwidth
+use,intrinsic::cudafor,only:cudadevattrmaxsurfacecubemaplayeredlayers
+use,intrinsic::cudafor,only:cudadevattrmaxtexture1dlinearwidth
+use,intrinsic::cudafor,only:cudadevattrmaxtexture2dlinearwidth
+use,intrinsic::cudafor,only:cudadevattrmaxtexture2dlinearheight
+use,intrinsic::cudafor,only:cudadevattrmaxtexture2dlinearpitch
+use,intrinsic::cudafor,only:cudadevattrmaxtexture2dmipmappedwidth
+use,intrinsic::cudafor,only:cudadevattrmaxtexture2dmipmappedheight
+use,intrinsic::cudafor,only:cudadevattrcomputecapabilitymajor
+use,intrinsic::cudafor,only:cudadevattrcomputecapabilityminor
+use,intrinsic::cudafor,only:cudadevattrmaxtexture1dmipmappedwidth
+use,intrinsic::cudafor,only:cudadevattrstreamprioritiessupported
+use,intrinsic::cudafor,only:cudadevattrgloball1cachesupported
+use,intrinsic::cudafor,only:cudadevattrlocall1cachesupported
+use,intrinsic::cudafor,only:cudadevattrmaxsharedmemorypermultiprocessor
+use,intrinsic::cudafor,only:cudadevattrmaxregisterspermultiprocessor
+use,intrinsic::cudafor,only:cudadevattrmanagedmemory
+use,intrinsic::cudafor,only:cudadevattrismultigpuboard
+use,intrinsic::cudafor,only:cudadevattrmultigpuboardgroupid
+use,intrinsic::cudafor,only:cudadevattrhostnativeatomicsupported
+use,intrinsic::cudafor,only:cudadevattrsingletodoubleprecisionperfratio
+use,intrinsic::cudafor,only:cudadevattrpageablememoryaccess
+use,intrinsic::cudafor,only:cudadevattrconcurrentmanagedaccess
+use,intrinsic::cudafor,only:cudadevattrcomputepreemptionsupported
+use,intrinsic::cudafor,only:cudadevattrcanusehostpointerforregisteredmem
+use,intrinsic::cudafor,only:cudadevattrreserved92
+use,intrinsic::cudafor,only:cudadevattrreserved93
+use,intrinsic::cudafor,only:cudadevattrreserved94
+use,intrinsic::cudafor,only:cudadevattrcooperativelaunch
+use,intrinsic::cudafor,only:cudadevattrcooperativemultidevicelaunch
+use,intrinsic::cudafor,only:cudadevattrmaxsharedmemoryperblockoptin
+use,intrinsic::cudafor,only:cudadevattrcanflushremotewrites
+use,intrinsic::cudafor,only:cudadevattrhostregistersupported
+use,intrinsic::cudafor,only:cudadevattrpageablememoryaccessuseshostpagetables
+use,intrinsic::cudafor,only:cudadevattrdirectmanagedmemaccessfromhost
+use,intrinsic::cudafor,only:cudadevattrmaxblockspermultiprocessor
+use,intrinsic::cudafor,only:cudadevattrmaxpersistingl2cachesize
+use,intrinsic::cudafor,only:cudadevattrmaxaccesspolicywindowsize
+use,intrinsic::cudafor,only:cudadevattrreservedsharedmemoryperblock
+use,intrinsic::cudafor,only:cudadevattrsparsecudaarraysupported
+use,intrinsic::cudafor,only:cudadevattrhostregisterreadonlysupported
+use,intrinsic::cudafor,only:cudadevattrmaxtimelinesemaphoreinteropsupported
+use,intrinsic::cudafor,only:cudadevattrmemorypoolssupported
+use,intrinsic::cudafor,only:cudadevattrgpudirectrdmasupported
+use,intrinsic::cudafor,only:cudadevattrgpudirectrdmaflushwritesoptions
+use,intrinsic::cudafor,only:cudadevattrgpudirectrdmawritesordering
+use,intrinsic::cudafor,only:cudadevattrmemorypoolsupportedhandletypes
+use,intrinsic::cudafor,only:cudadevattrclusterlaunch
+use,intrinsic::cudafor,only:cudadevattrdeferredmappingcudaarraysupported
+use,intrinsic::cudafor,only:cudadevattrreserved122
+use,intrinsic::cudafor,only:cudadevattrreserved123
+use,intrinsic::cudafor,only:cudadevattrreserved124
+use,intrinsic::cudafor,only:cudadevattripceventsupport
+use,intrinsic::cudafor,only:cudadevattrmemsyncdomaincount
+use,intrinsic::cudafor,only:cudadevattrreserved127
+use,intrinsic::cudafor,only:cudadevattrreserved128
+use,intrinsic::cudafor,only:cudadevattrreserved129
+use,intrinsic::cudafor,only:cudadevattrnumaconfig
+use,intrinsic::cudafor,only:cudadevattrnumaid
+use,intrinsic::cudafor,only:cudadevattrreserved132
+use,intrinsic::cudafor,only:cudadevattrmpsenabled
+use,intrinsic::cudafor,only:cudadevattrhostnumaid
+use,intrinsic::cudafor,only:cudadevattrd3d12cigsupported
+use,intrinsic::cudafor,only:cudadevattrvulkancigsupported
+use,intrinsic::cudafor,only:cudadevattrgpupcideviceid
+use,intrinsic::cudafor,only:cudadevattrgpupcisubsystemid
+use,intrinsic::cudafor,only:cudadevattrreserved141
+use,intrinsic::cudafor,only:cudadevattrhostnumamemorypoolssupported
+use,intrinsic::cudafor,only:cudadevattrhostnumamultinodeipcsupported
+use,intrinsic::cudafor,only:cudadevattrhostmemorypoolssupported
+use,intrinsic::cudafor,only:cudadevattrreserved145
+use,intrinsic::cudafor,only:cudadevattronlypartialhostnativeatomicsupported
+use,intrinsic::cudafor,only:cudadevattrmax
+use,intrinsic::cudafor,only:cudastreamcapturestatusnone
+use,intrinsic::cudafor,only:cudastreamcapturestatusactive
+use,intrinsic::cudafor,only:cudastreamcapturestatusinvalidated
+use,intrinsic::cudafor,only:cudastreamcapturemodeglobal
+use,intrinsic::cudafor,only:cudastreamcapturemodethreadlocal
+use,intrinsic::cudafor,only:cudastreamcapturemoderelaxed
+use,intrinsic::cudafor,only:cudasyncpolicyauto
+use,intrinsic::cudafor,only:cudasyncpolicyspin
+use,intrinsic::cudafor,only:cudasyncpolicyyield
+use,intrinsic::cudafor,only:cudasyncpolicyblockingsync
+use,intrinsic::cudafor,only:cudahostallocdefault
+use,intrinsic::cudafor,only:cudahostallocportable
+use,intrinsic::cudafor,only:cudahostallocmapped
+use,intrinsic::cudafor,only:cudahostallocwritecombined
+use,intrinsic::cudafor,only:cudahostregisterdefault
+use,intrinsic::cudafor,only:cudahostregisterportable
+use,intrinsic::cudafor,only:cudahostregistermapped
+use,intrinsic::cudafor,only:cudahostregisteriomemory
+use,intrinsic::cudafor,only:cudapeeraccessdefault
+use,intrinsic::cudafor,only:cudadevicescheduleauto
+use,intrinsic::cudafor,only:cudadeviceschedulespin
+use,intrinsic::cudafor,only:cudadevicescheduleyield
+use,intrinsic::cudafor,only:cudadeviceblockingsync
+use,intrinsic::cudafor,only:cudadevicescheduleblockingsync
+use,intrinsic::cudafor,only:cudadevicemaphost
+use,intrinsic::cudafor,only:cudastreamdefault
+use,intrinsic::cudafor,only:cudastreamnonblocking
+use,intrinsic::cudafor,only:cudaeventdefault
+use,intrinsic::cudafor,only:cudaeventblockingsync
+use,intrinsic::cudafor,only:cudaeventdisabletiming
+use,intrinsic::cudafor,only:cudaeventinterprocess
+use,intrinsic::cudafor,only:cudatexturetype1d
+use,intrinsic::cudafor,only:cudatexturetype2d
+use,intrinsic::cudafor,only:cudatexturetype3d
+use,intrinsic::cudafor,only:cudamemattachglobal
+use,intrinsic::cudafor,only:cudamemattachhost
+use,intrinsic::cudafor,only:cudamemattachsingle
+use,intrinsic::cudafor,only:cudaoccupancydefault
+use,intrinsic::cudafor,only:cudaoccupancydisablecachingoverride
+use,intrinsic::cudafor,only:cudacpudeviceid
+use,intrinsic::cudafor,only:cudainvaliddeviceid
+use,intrinsic::cudafor,only:cudamempoolreusefolloweventdependencies
+use,intrinsic::cudafor,only:cudamempoolreuseallowopportunistic
+use,intrinsic::cudafor,only:cudamempoolreuseallowinternaldependencies
+use,intrinsic::cudafor,only:cudamempoolattrreleasethreshold
+use,intrinsic::cudafor,only:cudadevp2pattrperformancerank
+use,intrinsic::cudafor,only:cudadevp2pattraccesssupported
+use,intrinsic::cudafor,only:cudadevp2pattrnativeatomicsupported
+use,intrinsic::cudafor,only:cudadevp2pattrcudaarrayaccesssupported
+use,intrinsic::cudafor,only:cudalaunchattributeignore
+use,intrinsic::cudafor,only:cudalaunchattributeaccesspolicywindow
+use,intrinsic::cudafor,only:cudalaunchattributecooperative
+use,intrinsic::cudafor,only:cudalaunchattributesynchronizationpolicy
+use,intrinsic::cudafor,only:cudalaunchattributeclusterdimension
+use,intrinsic::cudafor,only:cudalaunchattributeclusterschedulingpolicypreference
+use,intrinsic::cudafor,only:cudalaunchattributeprogrammaticstreamserialization
+use,intrinsic::cudafor,only:cudalaunchattributeprogrammaticevent
+use,intrinsic::cudafor,only:cudalaunchattributepriority
+use,intrinsic::cudafor,only:cuda_graph_instantiate_flag_auto_free_on_launch
+use,intrinsic::cudafor,only:cuda_graph_instantiate_flag_upload
+use,intrinsic::cudafor,only:cuda_graph_instantiate_flag_device_launch
+use,intrinsic::cudafor,only:cuda_graph_instantiate_flag_use_node_priority
+use,intrinsic::cudafor,only:cuda_event_kind
+use,intrinsic::cudafor,only:int_ptr_kind
+use,intrinsic::cudafor,only:cuda_count_kind
+use,intrinsic::cudafor,only:cuda_stream_kind
+use,intrinsic::cudafor,only:cudaarrayptr
+use,intrinsic::cudafor,only:cudadeviceprop
+use,intrinsic::cudafor,only:cudadeviceprop130
+use,intrinsic::cudafor,only:cudaevent
+use,intrinsic::cudafor,only:cudaipcmemhandle
+use,intrinsic::cudafor,only:cudaipceventhandle
+use,intrinsic::cudafor,only:cudapitchedptr
+use,intrinsic::cudafor,only:cudachannelformatdesc
+use,intrinsic::cudafor,only:cudaextent
+use,intrinsic::cudafor,only:cudapos
+use,intrinsic::cudafor,only:cudamemcpy3dparms
+use,intrinsic::cudafor,only:cudamemcpy3dpeerparms
+use,intrinsic::cudafor,only:cudapointerattributes
+use,intrinsic::cudafor,only:cudafuncattributes
+use,intrinsic::cudafor,only:cudatexturereference
+use,intrinsic::cudafor,only:cudagraph
+use,intrinsic::cudafor,only:cudagraphexec
+use,intrinsic::cudafor,only:cudagraphnode
+use,intrinsic::cudafor,only:cudamempool
+use,intrinsic::cudafor,only:cudamemlocation
+use,intrinsic::cudafor,only:cudamempoolprops
+use,intrinsic::cudafor,only:cudalaunchconfig
+use,intrinsic::cudafor,only:cudastreamlegacy
+use,intrinsic::cudafor,only:cudastreamperthread
+use,intrinsic::cudafor,only:cudadevicegetattribute
+use,intrinsic::cudafor,only:cudadevicegetcacheconfig
+use,intrinsic::cudafor,only:cudadevicegetlimit
+use,intrinsic::cudafor,only:cudadevicereset
+use,intrinsic::cudafor,only:cudadevicesetcacheconfig
+use,intrinsic::cudafor,only:cudadevicegetsharedmemconfig
+use,intrinsic::cudafor,only:cudadevicesetsharedmemconfig
+use,intrinsic::cudafor,only:cudadevicesetlimit
+use,intrinsic::cudafor,only:cudadevicesynchronize
+use,intrinsic::cudafor,only:cudagetdevice
+use,intrinsic::cudafor,only:cudagetdevicecount
+use,intrinsic::cudafor,only:cudagetdeviceproperties130
+use,intrinsic::cudafor,only:cudasetdevice
+use,intrinsic::cudafor,only:cudasetdeviceflags
+use,intrinsic::cudafor,only:cudasetvaliddevices
+use,intrinsic::cudafor,only:cudadevicegetstreampriorityrange
+use,intrinsic::cudafor,only:cudathreadsynchronize
+use,intrinsic::cudafor,only:cudathreadexit
+use,intrinsic::cudafor,only:cudastreamcreatei8
+use,intrinsic::cudafor,only:cudastreamcreatewithflags
+use,intrinsic::cudafor,only:cudastreamcreatewithpriority
+use,intrinsic::cudafor,only:cudastreamquery
+use,intrinsic::cudafor,only:cudastreamsynchronizenull
+use,intrinsic::cudafor,only:cudastreamdestroy
+use,intrinsic::cudafor,only:cudastreamwaitevent
+use,intrinsic::cudafor,only:cudastreamattachmemasynccd
+use,intrinsic::cudafor,only:cudasetstreamdefault
+use,intrinsic::cudafor,only:cudasetstreamarray
+use,intrinsic::cudafor,only:cudagetstreamdefaultarg
+use,intrinsic::cudafor,only:cudagetstreamdefaultnull
+use,intrinsic::cudafor,only:cudastreamgetpriority
+use,intrinsic::cudafor,only:cudastreambegincapture
+use,intrinsic::cudafor,only:cudastreamendcapture
+use,intrinsic::cudafor,only:cudastreamiscapturing
+use,intrinsic::cudafor,only:cudastreamgetcaptureinfo
+use,intrinsic::cudafor,only:cudamallochost
+use,intrinsic::cudafor,only:cudahostalloc
+use,intrinsic::cudafor,only:cudafreehost
+use,intrinsic::cudafor,only:cudahostgetdevicepointer
+use,intrinsic::cudafor,only:cudahostgetflags
+use,intrinsic::cudafor,only:cudahostregister
+use,intrinsic::cudafor,only:cudahostunregister
+use,intrinsic::cudafor,only:cudamalloci1
+use,intrinsic::cudafor,only:cudamalloci2
+use,intrinsic::cudafor,only:cudamalloci4
+use,intrinsic::cudafor,only:cudamalloci8
+use,intrinsic::cudafor,only:cudamallocl1
+use,intrinsic::cudafor,only:cudamallocl2
+use,intrinsic::cudafor,only:cudamallocl4
+use,intrinsic::cudafor,only:cudamallocl8
+use,intrinsic::cudafor,only:cudamallocr2
+use,intrinsic::cudafor,only:cudamallocr4
+use,intrinsic::cudafor,only:cudamallocr8
+use,intrinsic::cudafor,only:cudamallocc4
+use,intrinsic::cudafor,only:cudamallocc8
+use,intrinsic::cudafor,only:cudamallocc1
+use,intrinsic::cudafor,only:cudamalloccd
+use,intrinsic::cudafor,only:cudamallocmngi1
+use,intrinsic::cudafor,only:cudamallocmngi2
+use,intrinsic::cudafor,only:cudamallocmngi4
+use,intrinsic::cudafor,only:cudamallocmngi8
+use,intrinsic::cudafor,only:cudamallocmngl1
+use,intrinsic::cudafor,only:cudamallocmngl2
+use,intrinsic::cudafor,only:cudamallocmngl4
+use,intrinsic::cudafor,only:cudamallocmngl8
+use,intrinsic::cudafor,only:cudamallocmngr2
+use,intrinsic::cudafor,only:cudamallocmngr4
+use,intrinsic::cudafor,only:cudamallocmngr8
+use,intrinsic::cudafor,only:cudamallocmngc4
+use,intrinsic::cudafor,only:cudamallocmngc8
+use,intrinsic::cudafor,only:cudamallocmngc1
+use,intrinsic::cudafor,only:cudamallocmngcd
+use,intrinsic::cudafor,only:cudamallocasynci1
+use,intrinsic::cudafor,only:cudamallocasynci2
+use,intrinsic::cudafor,only:cudamallocasynci4
+use,intrinsic::cudafor,only:cudamallocasynci8
+use,intrinsic::cudafor,only:cudamallocasyncl1
+use,intrinsic::cudafor,only:cudamallocasyncl2
+use,intrinsic::cudafor,only:cudamallocasyncl4
+use,intrinsic::cudafor,only:cudamallocasyncl8
+use,intrinsic::cudafor,only:cudamallocasyncr2
+use,intrinsic::cudafor,only:cudamallocasyncr4
+use,intrinsic::cudafor,only:cudamallocasyncr8
+use,intrinsic::cudafor,only:cudamallocasyncc4
+use,intrinsic::cudafor,only:cudamallocasyncc8
+use,intrinsic::cudafor,only:cudamallocasyncc1
+use,intrinsic::cudafor,only:cudamallocasynccd
+use,intrinsic::cudafor,only:cudamemadvisei1
+use,intrinsic::cudafor,only:cudamemadvisei2
+use,intrinsic::cudafor,only:cudamemadvisei4
+use,intrinsic::cudafor,only:cudamemadvisei8
+use,intrinsic::cudafor,only:cudamemadvisel1
+use,intrinsic::cudafor,only:cudamemadvisel2
+use,intrinsic::cudafor,only:cudamemadvisel4
+use,intrinsic::cudafor,only:cudamemadvisel8
+use,intrinsic::cudafor,only:cudamemadviser2
+use,intrinsic::cudafor,only:cudamemadviser4
+use,intrinsic::cudafor,only:cudamemadviser8
+use,intrinsic::cudafor,only:cudamemadvisec4
+use,intrinsic::cudafor,only:cudamemadvisec8
+use,intrinsic::cudafor,only:cudamemadvisec1
+use,intrinsic::cudafor,only:cudamemadvisecd
+use,intrinsic::cudafor,only:cudamemadvisei1_v2_1
+use,intrinsic::cudafor,only:cudamemadvisei2_v2_1
+use,intrinsic::cudafor,only:cudamemadvisei4_v2_1
+use,intrinsic::cudafor,only:cudamemadvisei8_v2_1
+use,intrinsic::cudafor,only:cudamemadvisel1_v2_1
+use,intrinsic::cudafor,only:cudamemadvisel2_v2_1
+use,intrinsic::cudafor,only:cudamemadvisel4_v2_1
+use,intrinsic::cudafor,only:cudamemadvisel8_v2_1
+use,intrinsic::cudafor,only:cudamemadviser2_v2_1
+use,intrinsic::cudafor,only:cudamemadviser4_v2_1
+use,intrinsic::cudafor,only:cudamemadviser8_v2_1
+use,intrinsic::cudafor,only:cudamemadvisec4_v2_1
+use,intrinsic::cudafor,only:cudamemadvisec8_v2_1
+use,intrinsic::cudafor,only:cudamemadvisec1_v2_1
+use,intrinsic::cudafor,only:cudamemadvisecd_v2_1
+use,intrinsic::cudafor,only:cudamemadvisei1_v2
+use,intrinsic::cudafor,only:cudamemadvisei2_v2
+use,intrinsic::cudafor,only:cudamemadvisei4_v2
+use,intrinsic::cudafor,only:cudamemadvisei8_v2
+use,intrinsic::cudafor,only:cudamemadvisel1_v2
+use,intrinsic::cudafor,only:cudamemadvisel2_v2
+use,intrinsic::cudafor,only:cudamemadvisel4_v2
+use,intrinsic::cudafor,only:cudamemadvisel8_v2
+use,intrinsic::cudafor,only:cudamemadviser2_v2
+use,intrinsic::cudafor,only:cudamemadviser4_v2
+use,intrinsic::cudafor,only:cudamemadviser8_v2
+use,intrinsic::cudafor,only:cudamemadvisec4_v2
+use,intrinsic::cudafor,only:cudamemadvisec8_v2
+use,intrinsic::cudafor,only:cudamemadvisec1_v2
+use,intrinsic::cudafor,only:cudamemadvisecd_v2
+use,intrinsic::cudafor,only:cudamemprefetchasynci1
+use,intrinsic::cudafor,only:cudamemprefetchasynci2
+use,intrinsic::cudafor,only:cudamemprefetchasynci4
+use,intrinsic::cudafor,only:cudamemprefetchasynci8
+use,intrinsic::cudafor,only:cudamemprefetchasyncl1
+use,intrinsic::cudafor,only:cudamemprefetchasyncl2
+use,intrinsic::cudafor,only:cudamemprefetchasyncl4
+use,intrinsic::cudafor,only:cudamemprefetchasyncl8
+use,intrinsic::cudafor,only:cudamemprefetchasyncr2
+use,intrinsic::cudafor,only:cudamemprefetchasyncr4
+use,intrinsic::cudafor,only:cudamemprefetchasyncr8
+use,intrinsic::cudafor,only:cudamemprefetchasyncc4
+use,intrinsic::cudafor,only:cudamemprefetchasyncc8
+use,intrinsic::cudafor,only:cudamemprefetchasyncc1
+use,intrinsic::cudafor,only:cudamemprefetchasynccd
+use,intrinsic::cudafor,only:cudamemprefetchasynci1_v2_1
+use,intrinsic::cudafor,only:cudamemprefetchasynci2_v2_1
+use,intrinsic::cudafor,only:cudamemprefetchasynci4_v2_1
+use,intrinsic::cudafor,only:cudamemprefetchasynci8_v2_1
+use,intrinsic::cudafor,only:cudamemprefetchasyncl1_v2_1
+use,intrinsic::cudafor,only:cudamemprefetchasyncl2_v2_1
+use,intrinsic::cudafor,only:cudamemprefetchasyncl4_v2_1
+use,intrinsic::cudafor,only:cudamemprefetchasyncl8_v2_1
+use,intrinsic::cudafor,only:cudamemprefetchasyncr2_v2_1
+use,intrinsic::cudafor,only:cudamemprefetchasyncr4_v2_1
+use,intrinsic::cudafor,only:cudamemprefetchasyncr8_v2_1
+use,intrinsic::cudafor,only:cudamemprefetchasyncc4_v2_1
+use,intrinsic::cudafor,only:cudamemprefetchasyncc8_v2_1
+use,intrinsic::cudafor,only:cudamemprefetchasyncc1_v2_1
+use,intrinsic::cudafor,only:cudamemprefetchasynccd_v2_1
+use,intrinsic::cudafor,only:cudamemprefetchasynci1_v2
+use,intrinsic::cudafor,only:cudamemprefetchasynci2_v2
+use,intrinsic::cudafor,only:cudamemprefetchasynci4_v2
+use,intrinsic::cudafor,only:cudamemprefetchasynci8_v2
+use,intrinsic::cudafor,only:cudamemprefetchasyncl1_v2
+use,intrinsic::cudafor,only:cudamemprefetchasyncl2_v2
+use,intrinsic::cudafor,only:cudamemprefetchasyncl4_v2
+use,intrinsic::cudafor,only:cudamemprefetchasyncl8_v2
+use,intrinsic::cudafor,only:cudamemprefetchasyncr2_v2
+use,intrinsic::cudafor,only:cudamemprefetchasyncr4_v2
+use,intrinsic::cudafor,only:cudamemprefetchasyncr8_v2
+use,intrinsic::cudafor,only:cudamemprefetchasyncc4_v2
+use,intrinsic::cudafor,only:cudamemprefetchasyncc8_v2
+use,intrinsic::cudafor,only:cudamemprefetchasyncc1_v2
+use,intrinsic::cudafor,only:cudamemprefetchasynccd_v2
+use,intrinsic::cudafor,only:cudamemrangegetattributei1a
+use,intrinsic::cudafor,only:cudamemrangegetattributei1b
+use,intrinsic::cudafor,only:cudamemrangegetattributei2a
+use,intrinsic::cudafor,only:cudamemrangegetattributei2b
+use,intrinsic::cudafor,only:cudamemrangegetattributei4a
+use,intrinsic::cudafor,only:cudamemrangegetattributei4b
+use,intrinsic::cudafor,only:cudamemrangegetattributei8a
+use,intrinsic::cudafor,only:cudamemrangegetattributei8b
+use,intrinsic::cudafor,only:cudamemrangegetattributel1a
+use,intrinsic::cudafor,only:cudamemrangegetattributel1b
+use,intrinsic::cudafor,only:cudamemrangegetattributel2a
+use,intrinsic::cudafor,only:cudamemrangegetattributel2b
+use,intrinsic::cudafor,only:cudamemrangegetattributel4a
+use,intrinsic::cudafor,only:cudamemrangegetattributel4b
+use,intrinsic::cudafor,only:cudamemrangegetattributel8a
+use,intrinsic::cudafor,only:cudamemrangegetattributel8b
+use,intrinsic::cudafor,only:cudamemrangegetattributer2a
+use,intrinsic::cudafor,only:cudamemrangegetattributer2b
+use,intrinsic::cudafor,only:cudamemrangegetattributer4a
+use,intrinsic::cudafor,only:cudamemrangegetattributer4b
+use,intrinsic::cudafor,only:cudamemrangegetattributer8a
+use,intrinsic::cudafor,only:cudamemrangegetattributer8b
+use,intrinsic::cudafor,only:cudamemrangegetattributec4a
+use,intrinsic::cudafor,only:cudamemrangegetattributec4b
+use,intrinsic::cudafor,only:cudamemrangegetattributec8a
+use,intrinsic::cudafor,only:cudamemrangegetattributec8b
+use,intrinsic::cudafor,only:cudamemrangegetattributec1a
+use,intrinsic::cudafor,only:cudamemrangegetattributec1b
+use,intrinsic::cudafor,only:cudamemrangegetattributecda
+use,intrinsic::cudafor,only:cudamemrangegetattributecdb
+use,intrinsic::cudafor,only:cudafreei1
+use,intrinsic::cudafor,only:cudafreei2
+use,intrinsic::cudafor,only:cudafreei4
+use,intrinsic::cudafor,only:cudafreei8
+use,intrinsic::cudafor,only:cudafreel1
+use,intrinsic::cudafor,only:cudafreel2
+use,intrinsic::cudafor,only:cudafreel4
+use,intrinsic::cudafor,only:cudafreel8
+use,intrinsic::cudafor,only:cudafreer2
+use,intrinsic::cudafor,only:cudafreer4
+use,intrinsic::cudafor,only:cudafreer8
+use,intrinsic::cudafor,only:cudafreec4
+use,intrinsic::cudafor,only:cudafreec8
+use,intrinsic::cudafor,only:cudafreec1
+use,intrinsic::cudafor,only:cudafreecd
+use,intrinsic::cudafor,only:cudafreeasynci1
+use,intrinsic::cudafor,only:cudafreeasynci2
+use,intrinsic::cudafor,only:cudafreeasynci4
+use,intrinsic::cudafor,only:cudafreeasynci8
+use,intrinsic::cudafor,only:cudafreeasyncl1
+use,intrinsic::cudafor,only:cudafreeasyncl2
+use,intrinsic::cudafor,only:cudafreeasyncl4
+use,intrinsic::cudafor,only:cudafreeasyncl8
+use,intrinsic::cudafor,only:cudafreeasyncr2
+use,intrinsic::cudafor,only:cudafreeasyncr4
+use,intrinsic::cudafor,only:cudafreeasyncr8
+use,intrinsic::cudafor,only:cudafreeasyncc4
+use,intrinsic::cudafor,only:cudafreeasyncc8
+use,intrinsic::cudafor,only:cudafreeasyncc1
+use,intrinsic::cudafor,only:cudafreeasynccd
+use,intrinsic::cudafor,only:cudamallocpitchi1
+use,intrinsic::cudafor,only:cudamallocpitchi2
+use,intrinsic::cudafor,only:cudamallocpitchi4
+use,intrinsic::cudafor,only:cudamallocpitchi8
+use,intrinsic::cudafor,only:cudamallocpitchl1
+use,intrinsic::cudafor,only:cudamallocpitchl2
+use,intrinsic::cudafor,only:cudamallocpitchl4
+use,intrinsic::cudafor,only:cudamallocpitchl8
+use,intrinsic::cudafor,only:cudamallocpitchr2
+use,intrinsic::cudafor,only:cudamallocpitchr4
+use,intrinsic::cudafor,only:cudamallocpitchr8
+use,intrinsic::cudafor,only:cudamallocpitchc4
+use,intrinsic::cudafor,only:cudamallocpitchc8
+use,intrinsic::cudafor,only:cudamallocpitchc1
+use,intrinsic::cudafor,only:cudamallocpitchcd
+use,intrinsic::cudafor,only:cudamemseti1
+use,intrinsic::cudafor,only:cudamemsetmngi1
+use,intrinsic::cudafor,only:cudamemseti2
+use,intrinsic::cudafor,only:cudamemsetmngi2
+use,intrinsic::cudafor,only:cudamemseti4
+use,intrinsic::cudafor,only:cudamemsetmngi4
+use,intrinsic::cudafor,only:cudamemseti8
+use,intrinsic::cudafor,only:cudamemsetmngi8
+use,intrinsic::cudafor,only:cudamemsetl1
+use,intrinsic::cudafor,only:cudamemsetmngl1
+use,intrinsic::cudafor,only:cudamemsetl2
+use,intrinsic::cudafor,only:cudamemsetmngl2
+use,intrinsic::cudafor,only:cudamemsetl4
+use,intrinsic::cudafor,only:cudamemsetmngl4
+use,intrinsic::cudafor,only:cudamemsetl8
+use,intrinsic::cudafor,only:cudamemsetmngl8
+use,intrinsic::cudafor,only:cudamemsetr2
+use,intrinsic::cudafor,only:cudamemsetmngr2
+use,intrinsic::cudafor,only:cudamemsetr4
+use,intrinsic::cudafor,only:cudamemsetmngr4
+use,intrinsic::cudafor,only:cudamemsetr8
+use,intrinsic::cudafor,only:cudamemsetmngr8
+use,intrinsic::cudafor,only:cudamemsetc4
+use,intrinsic::cudafor,only:cudamemsetmngc4
+use,intrinsic::cudafor,only:cudamemsetc8
+use,intrinsic::cudafor,only:cudamemsetmngc8
+use,intrinsic::cudafor,only:cudamemsetc1
+use,intrinsic::cudafor,only:cudamemsetmngc1
+use,intrinsic::cudafor,only:cudamemsetcd
+use,intrinsic::cudafor,only:cudamemsetcdapi
+use,intrinsic::cudafor,only:cudamemsetasynci1
+use,intrinsic::cudafor,only:cudamemsetasynci2
+use,intrinsic::cudafor,only:cudamemsetasynci4
+use,intrinsic::cudafor,only:cudamemsetasynci8
+use,intrinsic::cudafor,only:cudamemsetasyncl1
+use,intrinsic::cudafor,only:cudamemsetasyncl2
+use,intrinsic::cudafor,only:cudamemsetasyncl4
+use,intrinsic::cudafor,only:cudamemsetasyncl8
+use,intrinsic::cudafor,only:cudamemsetasyncr2
+use,intrinsic::cudafor,only:cudamemsetasyncr4
+use,intrinsic::cudafor,only:cudamemsetasyncr8
+use,intrinsic::cudafor,only:cudamemsetasyncc4
+use,intrinsic::cudafor,only:cudamemsetasyncc8
+use,intrinsic::cudafor,only:cudamemsetasyncc1
+use,intrinsic::cudafor,only:cudamemsetasynccd
+use,intrinsic::cudafor,only:cudamemsetasynccdapi
+use,intrinsic::cudafor,only:cudamemset2di1
+use,intrinsic::cudafor,only:cudamemset2dmngi1
+use,intrinsic::cudafor,only:cudamemset2di2
+use,intrinsic::cudafor,only:cudamemset2dmngi2
+use,intrinsic::cudafor,only:cudamemset2di4
+use,intrinsic::cudafor,only:cudamemset2dmngi4
+use,intrinsic::cudafor,only:cudamemset2di8
+use,intrinsic::cudafor,only:cudamemset2dmngi8
+use,intrinsic::cudafor,only:cudamemset2dl1
+use,intrinsic::cudafor,only:cudamemset2dmngl1
+use,intrinsic::cudafor,only:cudamemset2dl2
+use,intrinsic::cudafor,only:cudamemset2dmngl2
+use,intrinsic::cudafor,only:cudamemset2dl4
+use,intrinsic::cudafor,only:cudamemset2dmngl4
+use,intrinsic::cudafor,only:cudamemset2dl8
+use,intrinsic::cudafor,only:cudamemset2dmngl8
+use,intrinsic::cudafor,only:cudamemset2dr2
+use,intrinsic::cudafor,only:cudamemset2dmngr2
+use,intrinsic::cudafor,only:cudamemset2dr4
+use,intrinsic::cudafor,only:cudamemset2dmngr4
+use,intrinsic::cudafor,only:cudamemset2dr8
+use,intrinsic::cudafor,only:cudamemset2dmngr8
+use,intrinsic::cudafor,only:cudamemset2dc4
+use,intrinsic::cudafor,only:cudamemset2dmngc4
+use,intrinsic::cudafor,only:cudamemset2dc8
+use,intrinsic::cudafor,only:cudamemset2dmngc8
+use,intrinsic::cudafor,only:cudamemset2dc1
+use,intrinsic::cudafor,only:cudamemset2dmngc1
+use,intrinsic::cudafor,only:cudamemset2dcd
+use,intrinsic::cudafor,only:cudamemset2dasynci1
+use,intrinsic::cudafor,only:cudamemset2dasynci2
+use,intrinsic::cudafor,only:cudamemset2dasynci4
+use,intrinsic::cudafor,only:cudamemset2dasynci8
+use,intrinsic::cudafor,only:cudamemset2dasyncl1
+use,intrinsic::cudafor,only:cudamemset2dasyncl2
+use,intrinsic::cudafor,only:cudamemset2dasyncl4
+use,intrinsic::cudafor,only:cudamemset2dasyncl8
+use,intrinsic::cudafor,only:cudamemset2dasyncr2
+use,intrinsic::cudafor,only:cudamemset2dasyncr4
+use,intrinsic::cudafor,only:cudamemset2dasyncr8
+use,intrinsic::cudafor,only:cudamemset2dasyncc4
+use,intrinsic::cudafor,only:cudamemset2dasyncc8
+use,intrinsic::cudafor,only:cudamemset2dasyncc1
+use,intrinsic::cudafor,only:cudamemset2dasynccd
+use,intrinsic::cudafor,only:cudamemcpyi1
+use,intrinsic::cudafor,only:cudamemcpyi2
+use,intrinsic::cudafor,only:cudamemcpyi4
+use,intrinsic::cudafor,only:cudamemcpyi8
+use,intrinsic::cudafor,only:cudamemcpyl1
+use,intrinsic::cudafor,only:cudamemcpyl2
+use,intrinsic::cudafor,only:cudamemcpyl4
+use,intrinsic::cudafor,only:cudamemcpyl8
+use,intrinsic::cudafor,only:cudamemcpyr2
+use,intrinsic::cudafor,only:cudamemcpyr4
+use,intrinsic::cudafor,only:cudamemcpyr8
+use,intrinsic::cudafor,only:cudamemcpyc4
+use,intrinsic::cudafor,only:cudamemcpyc8
+use,intrinsic::cudafor,only:cudamemcpyc1
+use,intrinsic::cudafor,only:cudamemcpycdin
+use,intrinsic::cudafor,only:cudamemcpycdout
+use,intrinsic::cudafor,only:cudamemcpycdover
+use,intrinsic::cudafor,only:cudamemcpyi1in
+use,intrinsic::cudafor,only:cudamemcpyi2in
+use,intrinsic::cudafor,only:cudamemcpyi4in
+use,intrinsic::cudafor,only:cudamemcpyi8in
+use,intrinsic::cudafor,only:cudamemcpyl1in
+use,intrinsic::cudafor,only:cudamemcpyl2in
+use,intrinsic::cudafor,only:cudamemcpyl4in
+use,intrinsic::cudafor,only:cudamemcpyl8in
+use,intrinsic::cudafor,only:cudamemcpyr2in
+use,intrinsic::cudafor,only:cudamemcpyr4in
+use,intrinsic::cudafor,only:cudamemcpyr8in
+use,intrinsic::cudafor,only:cudamemcpyc4in
+use,intrinsic::cudafor,only:cudamemcpyc8in
+use,intrinsic::cudafor,only:cudamemcpyc1in
+use,intrinsic::cudafor,only:cudamemcpyi1out
+use,intrinsic::cudafor,only:cudamemcpyi2out
+use,intrinsic::cudafor,only:cudamemcpyi4out
+use,intrinsic::cudafor,only:cudamemcpyi8out
+use,intrinsic::cudafor,only:cudamemcpyl1out
+use,intrinsic::cudafor,only:cudamemcpyl2out
+use,intrinsic::cudafor,only:cudamemcpyl4out
+use,intrinsic::cudafor,only:cudamemcpyl8out
+use,intrinsic::cudafor,only:cudamemcpyr2out
+use,intrinsic::cudafor,only:cudamemcpyr4out
+use,intrinsic::cudafor,only:cudamemcpyr8out
+use,intrinsic::cudafor,only:cudamemcpyc4out
+use,intrinsic::cudafor,only:cudamemcpyc8out
+use,intrinsic::cudafor,only:cudamemcpyc1out
+use,intrinsic::cudafor,only:cudamemcpyi1over
+use,intrinsic::cudafor,only:cudamemcpyi2over
+use,intrinsic::cudafor,only:cudamemcpyi4over
+use,intrinsic::cudafor,only:cudamemcpyi8over
+use,intrinsic::cudafor,only:cudamemcpyl1over
+use,intrinsic::cudafor,only:cudamemcpyl2over
+use,intrinsic::cudafor,only:cudamemcpyl4over
+use,intrinsic::cudafor,only:cudamemcpyl8over
+use,intrinsic::cudafor,only:cudamemcpyr2over
+use,intrinsic::cudafor,only:cudamemcpyr4over
+use,intrinsic::cudafor,only:cudamemcpyr8over
+use,intrinsic::cudafor,only:cudamemcpyc4over
+use,intrinsic::cudafor,only:cudamemcpyc8over
+use,intrinsic::cudafor,only:cudamemcpyc1over
+use,intrinsic::cudafor,only:cudamemcpy2di1
+use,intrinsic::cudafor,only:cudamemcpy2di2
+use,intrinsic::cudafor,only:cudamemcpy2di4
+use,intrinsic::cudafor,only:cudamemcpy2di8
+use,intrinsic::cudafor,only:cudamemcpy2dl1
+use,intrinsic::cudafor,only:cudamemcpy2dl2
+use,intrinsic::cudafor,only:cudamemcpy2dl4
+use,intrinsic::cudafor,only:cudamemcpy2dl8
+use,intrinsic::cudafor,only:cudamemcpy2dr2
+use,intrinsic::cudafor,only:cudamemcpy2dr4
+use,intrinsic::cudafor,only:cudamemcpy2dr8
+use,intrinsic::cudafor,only:cudamemcpy2dc4
+use,intrinsic::cudafor,only:cudamemcpy2dc8
+use,intrinsic::cudafor,only:cudamemcpy2dc1
+use,intrinsic::cudafor,only:cudamemcpy2dcdin
+use,intrinsic::cudafor,only:cudamemcpy2dcdout
+use,intrinsic::cudafor,only:cudamemcpy2dcdover
+use,intrinsic::cudafor,only:cudamemcpy2di1in
+use,intrinsic::cudafor,only:cudamemcpy2di2in
+use,intrinsic::cudafor,only:cudamemcpy2di4in
+use,intrinsic::cudafor,only:cudamemcpy2di8in
+use,intrinsic::cudafor,only:cudamemcpy2dl1in
+use,intrinsic::cudafor,only:cudamemcpy2dl2in
+use,intrinsic::cudafor,only:cudamemcpy2dl4in
+use,intrinsic::cudafor,only:cudamemcpy2dl8in
+use,intrinsic::cudafor,only:cudamemcpy2dr2in
+use,intrinsic::cudafor,only:cudamemcpy2dr4in
+use,intrinsic::cudafor,only:cudamemcpy2dr8in
+use,intrinsic::cudafor,only:cudamemcpy2dc4in
+use,intrinsic::cudafor,only:cudamemcpy2dc8in
+use,intrinsic::cudafor,only:cudamemcpy2dc1in
+use,intrinsic::cudafor,only:cudamemcpy2di1out
+use,intrinsic::cudafor,only:cudamemcpy2di2out
+use,intrinsic::cudafor,only:cudamemcpy2di4out
+use,intrinsic::cudafor,only:cudamemcpy2di8out
+use,intrinsic::cudafor,only:cudamemcpy2dl1out
+use,intrinsic::cudafor,only:cudamemcpy2dl2out
+use,intrinsic::cudafor,only:cudamemcpy2dl4out
+use,intrinsic::cudafor,only:cudamemcpy2dl8out
+use,intrinsic::cudafor,only:cudamemcpy2dr2out
+use,intrinsic::cudafor,only:cudamemcpy2dr4out
+use,intrinsic::cudafor,only:cudamemcpy2dr8out
+use,intrinsic::cudafor,only:cudamemcpy2dc4out
+use,intrinsic::cudafor,only:cudamemcpy2dc8out
+use,intrinsic::cudafor,only:cudamemcpy2dc1out
+use,intrinsic::cudafor,only:cudamemcpy2di1over
+use,intrinsic::cudafor,only:cudamemcpy2di2over
+use,intrinsic::cudafor,only:cudamemcpy2di4over
+use,intrinsic::cudafor,only:cudamemcpy2di8over
+use,intrinsic::cudafor,only:cudamemcpy2dl1over
+use,intrinsic::cudafor,only:cudamemcpy2dl2over
+use,intrinsic::cudafor,only:cudamemcpy2dl4over
+use,intrinsic::cudafor,only:cudamemcpy2dl8over
+use,intrinsic::cudafor,only:cudamemcpy2dr2over
+use,intrinsic::cudafor,only:cudamemcpy2dr4over
+use,intrinsic::cudafor,only:cudamemcpy2dr8over
+use,intrinsic::cudafor,only:cudamemcpy2dc4over
+use,intrinsic::cudafor,only:cudamemcpy2dc8over
+use,intrinsic::cudafor,only:cudamemcpy2dc1over
+use,intrinsic::cudafor,only:cudamallocarray3
+use,intrinsic::cudafor,only:cudamallocarray5
+use,intrinsic::cudafor,only:cudafreearray
+use,intrinsic::cudafor,only:cudacreatechanneldesc
+use,intrinsic::cudafor,only:cudagetchanneldesc
+use,intrinsic::cudafor,only:cudamemcpytoarrayi1
+use,intrinsic::cudafor,only:cudamemcpytoarrayi2
+use,intrinsic::cudafor,only:cudamemcpytoarrayi4
+use,intrinsic::cudafor,only:cudamemcpytoarrayi8
+use,intrinsic::cudafor,only:cudamemcpytoarrayl1
+use,intrinsic::cudafor,only:cudamemcpytoarrayl2
+use,intrinsic::cudafor,only:cudamemcpytoarrayl4
+use,intrinsic::cudafor,only:cudamemcpytoarrayl8
+use,intrinsic::cudafor,only:cudamemcpytoarrayr4
+use,intrinsic::cudafor,only:cudamemcpytoarrayr8
+use,intrinsic::cudafor,only:cudamemcpytoarrayc4
+use,intrinsic::cudafor,only:cudamemcpytoarrayc8
+use,intrinsic::cudafor,only:cudamemcpytoarrayc1
+use,intrinsic::cudafor,only:cudamemcpy2dtoarrayi1
+use,intrinsic::cudafor,only:cudamemcpy2dtoarrayi2
+use,intrinsic::cudafor,only:cudamemcpy2dtoarrayi4
+use,intrinsic::cudafor,only:cudamemcpy2dtoarrayi8
+use,intrinsic::cudafor,only:cudamemcpy2dtoarrayl1
+use,intrinsic::cudafor,only:cudamemcpy2dtoarrayl2
+use,intrinsic::cudafor,only:cudamemcpy2dtoarrayl4
+use,intrinsic::cudafor,only:cudamemcpy2dtoarrayl8
+use,intrinsic::cudafor,only:cudamemcpy2dtoarrayr4
+use,intrinsic::cudafor,only:cudamemcpy2dtoarrayr8
+use,intrinsic::cudafor,only:cudamemcpy2dtoarrayc4
+use,intrinsic::cudafor,only:cudamemcpy2dtoarrayc8
+use,intrinsic::cudafor,only:cudamemcpy2dtoarrayc1
+use,intrinsic::cudafor,only:cudamemcpyfromarrayi1
+use,intrinsic::cudafor,only:cudamemcpyfromarrayi2
+use,intrinsic::cudafor,only:cudamemcpyfromarrayi4
+use,intrinsic::cudafor,only:cudamemcpyfromarrayi8
+use,intrinsic::cudafor,only:cudamemcpyfromarrayl1
+use,intrinsic::cudafor,only:cudamemcpyfromarrayl2
+use,intrinsic::cudafor,only:cudamemcpyfromarrayl4
+use,intrinsic::cudafor,only:cudamemcpyfromarrayl8
+use,intrinsic::cudafor,only:cudamemcpyfromarrayr4
+use,intrinsic::cudafor,only:cudamemcpyfromarrayr8
+use,intrinsic::cudafor,only:cudamemcpyfromarrayc4
+use,intrinsic::cudafor,only:cudamemcpyfromarrayc8
+use,intrinsic::cudafor,only:cudamemcpyfromarrayc1
+use,intrinsic::cudafor,only:cudamemcpy2dfromarrayi1
+use,intrinsic::cudafor,only:cudamemcpy2dfromarrayi2
+use,intrinsic::cudafor,only:cudamemcpy2dfromarrayi4
+use,intrinsic::cudafor,only:cudamemcpy2dfromarrayi8
+use,intrinsic::cudafor,only:cudamemcpy2dfromarrayl1
+use,intrinsic::cudafor,only:cudamemcpy2dfromarrayl2
+use,intrinsic::cudafor,only:cudamemcpy2dfromarrayl4
+use,intrinsic::cudafor,only:cudamemcpy2dfromarrayl8
+use,intrinsic::cudafor,only:cudamemcpy2dfromarrayr4
+use,intrinsic::cudafor,only:cudamemcpy2dfromarrayr8
+use,intrinsic::cudafor,only:cudamemcpy2dfromarrayc4
+use,intrinsic::cudafor,only:cudamemcpy2dfromarrayc8
+use,intrinsic::cudafor,only:cudamemcpy2dfromarrayc1
+use,intrinsic::cudafor,only:cudamemcpyarraytoarray
+use,intrinsic::cudafor,only:cudamemcpy2darraytoarray
+use,intrinsic::cudafor,only:cudamalloc3dcdevptr
+use,intrinsic::cudafor,only:cudamalloc3di1
+use,intrinsic::cudafor,only:cudamalloc3di2
+use,intrinsic::cudafor,only:cudamalloc3di4
+use,intrinsic::cudafor,only:cudamalloc3di8
+use,intrinsic::cudafor,only:cudamalloc3dl1
+use,intrinsic::cudafor,only:cudamalloc3dl2
+use,intrinsic::cudafor,only:cudamalloc3dl4
+use,intrinsic::cudafor,only:cudamalloc3dl8
+use,intrinsic::cudafor,only:cudamalloc3dr2
+use,intrinsic::cudafor,only:cudamalloc3dr4
+use,intrinsic::cudafor,only:cudamalloc3dr8
+use,intrinsic::cudafor,only:cudamalloc3dc4
+use,intrinsic::cudafor,only:cudamalloc3dc8
+use,intrinsic::cudafor,only:cudamalloc3dc1
+use,intrinsic::cudafor,only:cudamalloc3darray
+use,intrinsic::cudafor,only:cudamemset3d
+use,intrinsic::cudafor,only:cudamemcpy3d
+use,intrinsic::cudafor,only:cudamemcpy3dasync
+use,intrinsic::cudafor,only:cudamemcpy3dpeer
+use,intrinsic::cudafor,only:cudamemcpy3dpeerasync
+use,intrinsic::cudafor,only:cudamemcpytosymboli1
+use,intrinsic::cudafor,only:cudamemcpytosymboli2
+use,intrinsic::cudafor,only:cudamemcpytosymboli4
+use,intrinsic::cudafor,only:cudamemcpytosymboli8
+use,intrinsic::cudafor,only:cudamemcpytosymboll1
+use,intrinsic::cudafor,only:cudamemcpytosymboll2
+use,intrinsic::cudafor,only:cudamemcpytosymboll4
+use,intrinsic::cudafor,only:cudamemcpytosymboll8
+use,intrinsic::cudafor,only:cudamemcpytosymbolr2
+use,intrinsic::cudafor,only:cudamemcpytosymbolr4
+use,intrinsic::cudafor,only:cudamemcpytosymbolr8
+use,intrinsic::cudafor,only:cudamemcpytosymbolc4
+use,intrinsic::cudafor,only:cudamemcpytosymbolc8
+use,intrinsic::cudafor,only:cudamemcpytosymbolc1
+use,intrinsic::cudafor,only:cudamemcpytosymbolcp
+use,intrinsic::cudafor,only:cudamemcpyfromsymboli1
+use,intrinsic::cudafor,only:cudamemcpyfromsymboli2
+use,intrinsic::cudafor,only:cudamemcpyfromsymboli4
+use,intrinsic::cudafor,only:cudamemcpyfromsymboli8
+use,intrinsic::cudafor,only:cudamemcpyfromsymboll1
+use,intrinsic::cudafor,only:cudamemcpyfromsymboll2
+use,intrinsic::cudafor,only:cudamemcpyfromsymboll4
+use,intrinsic::cudafor,only:cudamemcpyfromsymboll8
+use,intrinsic::cudafor,only:cudamemcpyfromsymbolr2
+use,intrinsic::cudafor,only:cudamemcpyfromsymbolr4
+use,intrinsic::cudafor,only:cudamemcpyfromsymbolr8
+use,intrinsic::cudafor,only:cudamemcpyfromsymbolc4
+use,intrinsic::cudafor,only:cudamemcpyfromsymbolc8
+use,intrinsic::cudafor,only:cudamemcpyfromsymbolc1
+use,intrinsic::cudafor,only:cudamemcpyfromsymbolcp
+use,intrinsic::cudafor,only:cudagetsymboladdress
+use,intrinsic::cudafor,only:cudagetsymbolsize
+use,intrinsic::cudafor,only:cudamemcpytosymbolasynci1
+use,intrinsic::cudafor,only:cudamemcpytosymbolasynci2
+use,intrinsic::cudafor,only:cudamemcpytosymbolasynci4
+use,intrinsic::cudafor,only:cudamemcpytosymbolasynci8
+use,intrinsic::cudafor,only:cudamemcpytosymbolasyncl1
+use,intrinsic::cudafor,only:cudamemcpytosymbolasyncl2
+use,intrinsic::cudafor,only:cudamemcpytosymbolasyncl4
+use,intrinsic::cudafor,only:cudamemcpytosymbolasyncl8
+use,intrinsic::cudafor,only:cudamemcpytosymbolasyncr2
+use,intrinsic::cudafor,only:cudamemcpytosymbolasyncr4
+use,intrinsic::cudafor,only:cudamemcpytosymbolasyncr8
+use,intrinsic::cudafor,only:cudamemcpytosymbolasyncc4
+use,intrinsic::cudafor,only:cudamemcpytosymbolasyncc8
+use,intrinsic::cudafor,only:cudamemcpytosymbolasyncc1
+use,intrinsic::cudafor,only:cudamemcpytosymbolasynccp
+use,intrinsic::cudafor,only:cudamemcpytosymbolasync5i1
+use,intrinsic::cudafor,only:cudamemcpytosymbolasync5i2
+use,intrinsic::cudafor,only:cudamemcpytosymbolasync5i4
+use,intrinsic::cudafor,only:cudamemcpytosymbolasync5i8
+use,intrinsic::cudafor,only:cudamemcpytosymbolasync5l1
+use,intrinsic::cudafor,only:cudamemcpytosymbolasync5l2
+use,intrinsic::cudafor,only:cudamemcpytosymbolasync5l4
+use,intrinsic::cudafor,only:cudamemcpytosymbolasync5l8
+use,intrinsic::cudafor,only:cudamemcpytosymbolasync5r2
+use,intrinsic::cudafor,only:cudamemcpytosymbolasync5r4
+use,intrinsic::cudafor,only:cudamemcpytosymbolasync5r8
+use,intrinsic::cudafor,only:cudamemcpytosymbolasync5c4
+use,intrinsic::cudafor,only:cudamemcpytosymbolasync5c8
+use,intrinsic::cudafor,only:cudamemcpytosymbolasync5c1
+use,intrinsic::cudafor,only:cudamemcpytosymbolasync5cp
+use,intrinsic::cudafor,only:cudamemcpyfromsymbolasynci1
+use,intrinsic::cudafor,only:cudamemcpyfromsymbolasynci2
+use,intrinsic::cudafor,only:cudamemcpyfromsymbolasynci4
+use,intrinsic::cudafor,only:cudamemcpyfromsymbolasynci8
+use,intrinsic::cudafor,only:cudamemcpyfromsymbolasyncl1
+use,intrinsic::cudafor,only:cudamemcpyfromsymbolasyncl2
+use,intrinsic::cudafor,only:cudamemcpyfromsymbolasyncl4
+use,intrinsic::cudafor,only:cudamemcpyfromsymbolasyncl8
+use,intrinsic::cudafor,only:cudamemcpyfromsymbolasyncr2
+use,intrinsic::cudafor,only:cudamemcpyfromsymbolasyncr4
+use,intrinsic::cudafor,only:cudamemcpyfromsymbolasyncr8
+use,intrinsic::cudafor,only:cudamemcpyfromsymbolasyncc4
+use,intrinsic::cudafor,only:cudamemcpyfromsymbolasyncc8
+use,intrinsic::cudafor,only:cudamemcpyfromsymbolasyncc1
+use,intrinsic::cudafor,only:cudamemcpyfromsymbolasynccp
+use,intrinsic::cudafor,only:cudamemcpyfromsymbolasync5i1
+use,intrinsic::cudafor,only:cudamemcpyfromsymbolasync5i2
+use,intrinsic::cudafor,only:cudamemcpyfromsymbolasync5i4
+use,intrinsic::cudafor,only:cudamemcpyfromsymbolasync5i8
+use,intrinsic::cudafor,only:cudamemcpyfromsymbolasync5l1
+use,intrinsic::cudafor,only:cudamemcpyfromsymbolasync5l2
+use,intrinsic::cudafor,only:cudamemcpyfromsymbolasync5l4
+use,intrinsic::cudafor,only:cudamemcpyfromsymbolasync5l8
+use,intrinsic::cudafor,only:cudamemcpyfromsymbolasync5r2
+use,intrinsic::cudafor,only:cudamemcpyfromsymbolasync5r4
+use,intrinsic::cudafor,only:cudamemcpyfromsymbolasync5r8
+use,intrinsic::cudafor,only:cudamemcpyfromsymbolasync5c4
+use,intrinsic::cudafor,only:cudamemcpyfromsymbolasync5c8
+use,intrinsic::cudafor,only:cudamemcpyfromsymbolasync5c1
+use,intrinsic::cudafor,only:cudamemcpyfromsymbolasync5cp
+use,intrinsic::cudafor,only:cudaeventcreate
+use,intrinsic::cudafor,only:cudaeventrecord
+use,intrinsic::cudafor,only:cudaeventquery
+use,intrinsic::cudafor,only:cudaeventsynchronize
+use,intrinsic::cudafor,only:cudaeventdestroy
+use,intrinsic::cudafor,only:cudaeventelapsedtime
+use,intrinsic::cudafor,only:cudaeventcreatewithflags
+use,intrinsic::cudafor,only:cudamemcpyasynci1
+use,intrinsic::cudafor,only:cudamemcpyasynci2
+use,intrinsic::cudafor,only:cudamemcpyasynci4
+use,intrinsic::cudafor,only:cudamemcpyasynci8
+use,intrinsic::cudafor,only:cudamemcpyasyncl1
+use,intrinsic::cudafor,only:cudamemcpyasyncl2
+use,intrinsic::cudafor,only:cudamemcpyasyncl4
+use,intrinsic::cudafor,only:cudamemcpyasyncl8
+use,intrinsic::cudafor,only:cudamemcpyasyncr2
+use,intrinsic::cudafor,only:cudamemcpyasyncr4
+use,intrinsic::cudafor,only:cudamemcpyasyncr8
+use,intrinsic::cudafor,only:cudamemcpyasyncc4
+use,intrinsic::cudafor,only:cudamemcpyasyncc8
+use,intrinsic::cudafor,only:cudamemcpyasyncc1
+use,intrinsic::cudafor,only:cudamemcpyasynccdin
+use,intrinsic::cudafor,only:cudamemcpyasynccdout
+use,intrinsic::cudafor,only:cudamemcpyasynccdover
+use,intrinsic::cudafor,only:cudamemcpyasynci1in
+use,intrinsic::cudafor,only:cudamemcpyasynci2in
+use,intrinsic::cudafor,only:cudamemcpyasynci4in
+use,intrinsic::cudafor,only:cudamemcpyasynci8in
+use,intrinsic::cudafor,only:cudamemcpyasyncl1in
+use,intrinsic::cudafor,only:cudamemcpyasyncl2in
+use,intrinsic::cudafor,only:cudamemcpyasyncl4in
+use,intrinsic::cudafor,only:cudamemcpyasyncl8in
+use,intrinsic::cudafor,only:cudamemcpyasyncr2in
+use,intrinsic::cudafor,only:cudamemcpyasyncr4in
+use,intrinsic::cudafor,only:cudamemcpyasyncr8in
+use,intrinsic::cudafor,only:cudamemcpyasyncc4in
+use,intrinsic::cudafor,only:cudamemcpyasyncc8in
+use,intrinsic::cudafor,only:cudamemcpyasyncc1in
+use,intrinsic::cudafor,only:cudamemcpyasynci1out
+use,intrinsic::cudafor,only:cudamemcpyasynci2out
+use,intrinsic::cudafor,only:cudamemcpyasynci4out
+use,intrinsic::cudafor,only:cudamemcpyasynci8out
+use,intrinsic::cudafor,only:cudamemcpyasyncl1out
+use,intrinsic::cudafor,only:cudamemcpyasyncl2out
+use,intrinsic::cudafor,only:cudamemcpyasyncl4out
+use,intrinsic::cudafor,only:cudamemcpyasyncl8out
+use,intrinsic::cudafor,only:cudamemcpyasyncr2out
+use,intrinsic::cudafor,only:cudamemcpyasyncr4out
+use,intrinsic::cudafor,only:cudamemcpyasyncr8out
+use,intrinsic::cudafor,only:cudamemcpyasyncc4out
+use,intrinsic::cudafor,only:cudamemcpyasyncc8out
+use,intrinsic::cudafor,only:cudamemcpyasyncc1out
+use,intrinsic::cudafor,only:cudamemcpyasynci1over
+use,intrinsic::cudafor,only:cudamemcpyasynci2over
+use,intrinsic::cudafor,only:cudamemcpyasynci4over
+use,intrinsic::cudafor,only:cudamemcpyasynci8over
+use,intrinsic::cudafor,only:cudamemcpyasyncl1over
+use,intrinsic::cudafor,only:cudamemcpyasyncl2over
+use,intrinsic::cudafor,only:cudamemcpyasyncl4over
+use,intrinsic::cudafor,only:cudamemcpyasyncl8over
+use,intrinsic::cudafor,only:cudamemcpyasyncr2over
+use,intrinsic::cudafor,only:cudamemcpyasyncr4over
+use,intrinsic::cudafor,only:cudamemcpyasyncr8over
+use,intrinsic::cudafor,only:cudamemcpyasyncc4over
+use,intrinsic::cudafor,only:cudamemcpyasyncc8over
+use,intrinsic::cudafor,only:cudamemcpyasyncc1over
+use,intrinsic::cudafor,only:cudamemcpyasynccdin4
+use,intrinsic::cudafor,only:cudamemcpyasynccdout4
+use,intrinsic::cudafor,only:cudamemcpyasynccdover4
+use,intrinsic::cudafor,only:cudamemcpyasync3i1in
+use,intrinsic::cudafor,only:cudamemcpyasync3i2in
+use,intrinsic::cudafor,only:cudamemcpyasync3i4in
+use,intrinsic::cudafor,only:cudamemcpyasync3i8in
+use,intrinsic::cudafor,only:cudamemcpyasync3l1in
+use,intrinsic::cudafor,only:cudamemcpyasync3l2in
+use,intrinsic::cudafor,only:cudamemcpyasync3l4in
+use,intrinsic::cudafor,only:cudamemcpyasync3l8in
+use,intrinsic::cudafor,only:cudamemcpyasync3r2in
+use,intrinsic::cudafor,only:cudamemcpyasync3r4in
+use,intrinsic::cudafor,only:cudamemcpyasync3r8in
+use,intrinsic::cudafor,only:cudamemcpyasync3c4in
+use,intrinsic::cudafor,only:cudamemcpyasync3c8in
+use,intrinsic::cudafor,only:cudamemcpyasync3c1in
+use,intrinsic::cudafor,only:cudamemcpyasync3i1out
+use,intrinsic::cudafor,only:cudamemcpyasync3i2out
+use,intrinsic::cudafor,only:cudamemcpyasync3i4out
+use,intrinsic::cudafor,only:cudamemcpyasync3i8out
+use,intrinsic::cudafor,only:cudamemcpyasync3l1out
+use,intrinsic::cudafor,only:cudamemcpyasync3l2out
+use,intrinsic::cudafor,only:cudamemcpyasync3l4out
+use,intrinsic::cudafor,only:cudamemcpyasync3l8out
+use,intrinsic::cudafor,only:cudamemcpyasync3r2out
+use,intrinsic::cudafor,only:cudamemcpyasync3r4out
+use,intrinsic::cudafor,only:cudamemcpyasync3r8out
+use,intrinsic::cudafor,only:cudamemcpyasync3c4out
+use,intrinsic::cudafor,only:cudamemcpyasync3c8out
+use,intrinsic::cudafor,only:cudamemcpyasync3c1out
+use,intrinsic::cudafor,only:cudamemcpyasync3i1over
+use,intrinsic::cudafor,only:cudamemcpyasync3i2over
+use,intrinsic::cudafor,only:cudamemcpyasync3i4over
+use,intrinsic::cudafor,only:cudamemcpyasync3i8over
+use,intrinsic::cudafor,only:cudamemcpyasync3l1over
+use,intrinsic::cudafor,only:cudamemcpyasync3l2over
+use,intrinsic::cudafor,only:cudamemcpyasync3l4over
+use,intrinsic::cudafor,only:cudamemcpyasync3l8over
+use,intrinsic::cudafor,only:cudamemcpyasync3r2over
+use,intrinsic::cudafor,only:cudamemcpyasync3r4over
+use,intrinsic::cudafor,only:cudamemcpyasync3r8over
+use,intrinsic::cudafor,only:cudamemcpyasync3c4over
+use,intrinsic::cudafor,only:cudamemcpyasync3c8over
+use,intrinsic::cudafor,only:cudamemcpyasync3c1over
+use,intrinsic::cudafor,only:cudamemcpyasync3cdin4
+use,intrinsic::cudafor,only:cudamemcpyasync3cdout4
+use,intrinsic::cudafor,only:cudamemcpyasync3cdover4
+use,intrinsic::cudafor,only:cudamemcpy2dasynccdin
+use,intrinsic::cudafor,only:cudamemcpy2dasynccdout
+use,intrinsic::cudafor,only:cudamemcpy2dasynccdover
+use,intrinsic::cudafor,only:cudamemcpy2dasynci1in
+use,intrinsic::cudafor,only:cudamemcpy2dasynci2in
+use,intrinsic::cudafor,only:cudamemcpy2dasynci4in
+use,intrinsic::cudafor,only:cudamemcpy2dasynci8in
+use,intrinsic::cudafor,only:cudamemcpy2dasyncl1in
+use,intrinsic::cudafor,only:cudamemcpy2dasyncl2in
+use,intrinsic::cudafor,only:cudamemcpy2dasyncl4in
+use,intrinsic::cudafor,only:cudamemcpy2dasyncl8in
+use,intrinsic::cudafor,only:cudamemcpy2dasyncr2in
+use,intrinsic::cudafor,only:cudamemcpy2dasyncr4in
+use,intrinsic::cudafor,only:cudamemcpy2dasyncr8in
+use,intrinsic::cudafor,only:cudamemcpy2dasyncc4in
+use,intrinsic::cudafor,only:cudamemcpy2dasyncc8in
+use,intrinsic::cudafor,only:cudamemcpy2dasyncc1in
+use,intrinsic::cudafor,only:cudamemcpy2dasynci1out
+use,intrinsic::cudafor,only:cudamemcpy2dasynci2out
+use,intrinsic::cudafor,only:cudamemcpy2dasynci4out
+use,intrinsic::cudafor,only:cudamemcpy2dasynci8out
+use,intrinsic::cudafor,only:cudamemcpy2dasyncl1out
+use,intrinsic::cudafor,only:cudamemcpy2dasyncl2out
+use,intrinsic::cudafor,only:cudamemcpy2dasyncl4out
+use,intrinsic::cudafor,only:cudamemcpy2dasyncl8out
+use,intrinsic::cudafor,only:cudamemcpy2dasyncr2out
+use,intrinsic::cudafor,only:cudamemcpy2dasyncr4out
+use,intrinsic::cudafor,only:cudamemcpy2dasyncr8out
+use,intrinsic::cudafor,only:cudamemcpy2dasyncc4out
+use,intrinsic::cudafor,only:cudamemcpy2dasyncc8out
+use,intrinsic::cudafor,only:cudamemcpy2dasyncc1out
+use,intrinsic::cudafor,only:cudamemcpy2dasynci1over
+use,intrinsic::cudafor,only:cudamemcpy2dasynci2over
+use,intrinsic::cudafor,only:cudamemcpy2dasynci4over
+use,intrinsic::cudafor,only:cudamemcpy2dasynci8over
+use,intrinsic::cudafor,only:cudamemcpy2dasyncl1over
+use,intrinsic::cudafor,only:cudamemcpy2dasyncl2over
+use,intrinsic::cudafor,only:cudamemcpy2dasyncl4over
+use,intrinsic::cudafor,only:cudamemcpy2dasyncl8over
+use,intrinsic::cudafor,only:cudamemcpy2dasyncr2over
+use,intrinsic::cudafor,only:cudamemcpy2dasyncr4over
+use,intrinsic::cudafor,only:cudamemcpy2dasyncr8over
+use,intrinsic::cudafor,only:cudamemcpy2dasyncc4over
+use,intrinsic::cudafor,only:cudamemcpy2dasyncc8over
+use,intrinsic::cudafor,only:cudamemcpy2dasyncc1over
+use,intrinsic::cudafor,only:cudamemcpypeeri1
+use,intrinsic::cudafor,only:cudamemcpypeeri2
+use,intrinsic::cudafor,only:cudamemcpypeeri4
+use,intrinsic::cudafor,only:cudamemcpypeeri8
+use,intrinsic::cudafor,only:cudamemcpypeerl1
+use,intrinsic::cudafor,only:cudamemcpypeerl2
+use,intrinsic::cudafor,only:cudamemcpypeerl4
+use,intrinsic::cudafor,only:cudamemcpypeerl8
+use,intrinsic::cudafor,only:cudamemcpypeerr2
+use,intrinsic::cudafor,only:cudamemcpypeerr4
+use,intrinsic::cudafor,only:cudamemcpypeerr8
+use,intrinsic::cudafor,only:cudamemcpypeerc4
+use,intrinsic::cudafor,only:cudamemcpypeerc8
+use,intrinsic::cudafor,only:cudamemcpypeerc1
+use,intrinsic::cudafor,only:cudamemcpypeercd
+use,intrinsic::cudafor,only:cudamemcpypeerasynci1
+use,intrinsic::cudafor,only:cudamemcpypeerasynci2
+use,intrinsic::cudafor,only:cudamemcpypeerasynci4
+use,intrinsic::cudafor,only:cudamemcpypeerasynci8
+use,intrinsic::cudafor,only:cudamemcpypeerasyncl1
+use,intrinsic::cudafor,only:cudamemcpypeerasyncl2
+use,intrinsic::cudafor,only:cudamemcpypeerasyncl4
+use,intrinsic::cudafor,only:cudamemcpypeerasyncl8
+use,intrinsic::cudafor,only:cudamemcpypeerasyncr2
+use,intrinsic::cudafor,only:cudamemcpypeerasyncr4
+use,intrinsic::cudafor,only:cudamemcpypeerasyncr8
+use,intrinsic::cudafor,only:cudamemcpypeerasyncc4
+use,intrinsic::cudafor,only:cudamemcpypeerasyncc8
+use,intrinsic::cudafor,only:cudamemcpypeerasyncc1
+use,intrinsic::cudafor,only:cudamemcpypeerasynccd
+use,intrinsic::cudafor,only:cudageterrorstring
+use,intrinsic::cudafor,only:cudagetlasterror
+use,intrinsic::cudafor,only:cudapeekatlasterror
+use,intrinsic::cudafor,only:cudadrivergetversion
+use,intrinsic::cudafor,only:cudaruntimegetversion
+use,intrinsic::cudafor,only:cudamemgetinfo
+use,intrinsic::cudafor,only:cudafuncsetcacheconfigfunc
+use,intrinsic::cudafor,only:cudafuncsetsharedmemconfigfunc
+use,intrinsic::cudafor,only:cudafuncgetattributesfunc
+use,intrinsic::cudafor,only:cudafuncsetattribute
+use,intrinsic::cudafor,only:cudapointergetattributesi
+use,intrinsic::cudafor,only:cudapointergetattributesl
+use,intrinsic::cudafor,only:cudapointergetattributesr
+use,intrinsic::cudafor,only:cudapointergetattributesc
+use,intrinsic::cudafor,only:cudapointergetattributesc1
+use,intrinsic::cudafor,only:cudapointergetattributeshp
+use,intrinsic::cudafor,only:cudapointergetattributesdp
+use,intrinsic::cudafor,only:cudasetdoublefordevice
+use,intrinsic::cudafor,only:cudasetdoubleforhost
+use,intrinsic::cudafor,only:cudadevicecanaccesspeeri
+use,intrinsic::cudafor,only:cudadevicecanaccesspeerl
+use,intrinsic::cudafor,only:cudadevicedisablepeeraccess
+use,intrinsic::cudafor,only:cudadeviceenablepeeraccess
+use,intrinsic::cudafor,only:cudadevicegetp2pattribute
+use,intrinsic::cudafor,only:cudaoccupancymaxactiveblockspermultiprocessor
+use,intrinsic::cudafor,only:cudaoccupancymaxactiveblockspermultiprocessorwithflags
+use,intrinsic::cudafor,only:cudaoccupancymaxpotentialclustersize
+use,intrinsic::cudafor,only:cudaoccupancymaxactiveclusters
+use,intrinsic::cudafor,only:cudaprofilerstart
+use,intrinsic::cudafor,only:cudaprofilerstop
+use,intrinsic::cudafor,only:cudagraphcreate
+use,intrinsic::cudafor,only:cudagraphinstantiate_11xxx
+use,intrinsic::cudafor,only:cudagraphinstantiate_12000
+use,intrinsic::cudafor,only:cudagraphlaunch
+use,intrinsic::cudafor,only:cudagraphexecdestroy
+use,intrinsic::cudafor,only:cudagraphdestroy
+use,intrinsic::cudafor,only:cudalaunchhostfunc
+use,intrinsic::cudafor,only:cudachoosedevice
+use,intrinsic::cudafor,only:cudagetdeviceproperties
+use,intrinsic::cudafor,only:cudastreamcreate
+use,intrinsic::cudafor,only:cudastreamsynchronize
+use,intrinsic::cudafor,only:cudastreamattachmemasync
+use,intrinsic::cudafor,only:cudaforsetdefaultstream
+use,intrinsic::cudafor,only:cudaforgetdefaultstream
+use,intrinsic::cudafor,only:cudamalloc
+use,intrinsic::cudafor,only:cudamallocmanaged
+use,intrinsic::cudafor,only:cudamallocasync
+use,intrinsic::cudafor,only:cudamemadvise
+use,intrinsic::cudafor,only:cudamemadvise_v2
+use,intrinsic::cudafor,only:cudamemprefetchasync
+use,intrinsic::cudafor,only:cudamemprefetchasync_v2
+use,intrinsic::cudafor,only:cudamemrangegetattribute
+use,intrinsic::cudafor,only:cudafree
+use,intrinsic::cudafor,only:cudafreeasync
+use,intrinsic::cudafor,only:cudamallocpitch
+use,intrinsic::cudafor,only:cudamemset
+use,intrinsic::cudafor,only:cudamemsetapi
+use,intrinsic::cudafor,only:cudamemsetasync
+use,intrinsic::cudafor,only:cudamemsetasyncapi
+use,intrinsic::cudafor,only:cudamemset2d
+use,intrinsic::cudafor,only:cudamemset2dasync
+use,intrinsic::cudafor,only:cudamemcpy
+use,intrinsic::cudafor,only:cudamemcpy2d
+use,intrinsic::cudafor,only:cudamallocarray
+use,intrinsic::cudafor,only:cudamemcpytoarray
+use,intrinsic::cudafor,only:cudamemcpy2dtoarray
+use,intrinsic::cudafor,only:cudamemcpyfromarray
+use,intrinsic::cudafor,only:cudamemcpy2dfromarray
+use,intrinsic::cudafor,only:cudamalloc3d
+use,intrinsic::cudafor,only:cudamemcpytosymbol
+use,intrinsic::cudafor,only:cudamemcpyfromsymbol
+use,intrinsic::cudafor,only:cudamemcpytosymbolasync
+use,intrinsic::cudafor,only:cudamemcpyfromsymbolasync
+use,intrinsic::cudafor,only:cudamemcpyasync
+use,intrinsic::cudafor,only:cudamemcpy2dasync
+use,intrinsic::cudafor,only:cudamemcpypeer
+use,intrinsic::cudafor,only:cudamemcpypeerasync
+use,intrinsic::cudafor,only:cudafuncsetcacheconfig
+use,intrinsic::cudafor,only:cudafuncsetsharedmemconfig
+use,intrinsic::cudafor,only:cudafuncgetattributes
+use,intrinsic::cudafor,only:cudapointergetattributes
+use,intrinsic::cudafor,only:cudadevicecanaccesspeer
+use,intrinsic::cudafor,only:cudagraphinstantiate
+use,intrinsic::cudafor,only:cutensoralgo
+use,intrinsic::cudafor,only:cutensor_algo_default_patient
+use,intrinsic::cudafor,only:cutensor_algo_gett
+use,intrinsic::cudafor,only:cutensor_algo_tgett
+use,intrinsic::cudafor,only:cutensor_algo_ttgt
+use,intrinsic::cudafor,only:cutensor_algo_default
+use,intrinsic::cudafor,only:cutensorworksizepreference
+use,intrinsic::cudafor,only:cutensor_workspace_min
+use,intrinsic::cudafor,only:cutensor_workspace_default
+use,intrinsic::cudafor,only:cutensor_workspace_max
+use,intrinsic::cudafor,only:cutensoroperator
+use,intrinsic::cudafor,only:cutensor_op_identity
+use,intrinsic::cudafor,only:cutensor_op_sqrt
+use,intrinsic::cudafor,only:cutensor_op_relu
+use,intrinsic::cudafor,only:cutensor_op_conj
+use,intrinsic::cudafor,only:cutensor_op_rcp
+use,intrinsic::cudafor,only:cutensor_op_sigmoid
+use,intrinsic::cudafor,only:cutensor_op_tanh
+use,intrinsic::cudafor,only:cutensor_op_exp
+use,intrinsic::cudafor,only:cutensor_op_log
+use,intrinsic::cudafor,only:cutensor_op_abs
+use,intrinsic::cudafor,only:cutensor_op_neg
+use,intrinsic::cudafor,only:cutensor_op_sin
+use,intrinsic::cudafor,only:cutensor_op_cos
+use,intrinsic::cudafor,only:cutensor_op_tan
+use,intrinsic::cudafor,only:cutensor_op_sinh
+use,intrinsic::cudafor,only:cutensor_op_cosh
+use,intrinsic::cudafor,only:cutensor_op_asin
+use,intrinsic::cudafor,only:cutensor_op_acos
+use,intrinsic::cudafor,only:cutensor_op_atan
+use,intrinsic::cudafor,only:cutensor_op_asinh
+use,intrinsic::cudafor,only:cutensor_op_acosh
+use,intrinsic::cudafor,only:cutensor_op_atanh
+use,intrinsic::cudafor,only:cutensor_op_ceil
+use,intrinsic::cudafor,only:cutensor_op_floor
+use,intrinsic::cudafor,only:cutensor_op_mish
+use,intrinsic::cudafor,only:cutensor_op_swish
+use,intrinsic::cudafor,only:cutensor_op_soft_plus
+use,intrinsic::cudafor,only:cutensor_op_soft_sign
+use,intrinsic::cudafor,only:cutensor_op_add
+use,intrinsic::cudafor,only:cutensor_op_mul
+use,intrinsic::cudafor,only:cutensor_op_max
+use,intrinsic::cudafor,only:cutensor_op_min
+use,intrinsic::cudafor,only:cutensor_op_unknown
+use,intrinsic::cudafor,only:cutensorstatus
+use,intrinsic::cudafor,only:cutensor_status_success
+use,intrinsic::cudafor,only:cutensor_status_not_initialized
+use,intrinsic::cudafor,only:cutensor_status_alloc_failed
+use,intrinsic::cudafor,only:cutensor_status_invalid_value
+use,intrinsic::cudafor,only:cutensor_status_arch_mismatch
+use,intrinsic::cudafor,only:cutensor_status_mapping_error
+use,intrinsic::cudafor,only:cutensor_status_execution_failed
+use,intrinsic::cudafor,only:cutensor_status_internal_error
+use,intrinsic::cudafor,only:cutensor_status_not_supported
+use,intrinsic::cudafor,only:cutensor_status_license_error
+use,intrinsic::cudafor,only:cutensor_status_cublas_error
+use,intrinsic::cudafor,only:cutensor_status_cuda_error
+use,intrinsic::cudafor,only:cutensor_status_insufficient_workspace
+use,intrinsic::cudafor,only:cutensor_status_insufficient_driver
+use,intrinsic::cudafor,only:cutensor_status_io_error
+use,intrinsic::cudafor,only:cutensordatatype
+use,intrinsic::cudafor,only:cutensor_r_16f
+use,intrinsic::cudafor,only:cutensor_c_16f
+use,intrinsic::cudafor,only:cutensor_r_16bf
+use,intrinsic::cudafor,only:cutensor_c_16bf
+use,intrinsic::cudafor,only:cutensor_r_32f
+use,intrinsic::cudafor,only:cutensor_c_32f
+use,intrinsic::cudafor,only:cutensor_r_64f
+use,intrinsic::cudafor,only:cutensor_c_64f
+use,intrinsic::cudafor,only:cutensor_r_4i
+use,intrinsic::cudafor,only:cutensor_c_4i
+use,intrinsic::cudafor,only:cutensor_r_4u
+use,intrinsic::cudafor,only:cutensor_c_4u
+use,intrinsic::cudafor,only:cutensor_r_8i
+use,intrinsic::cudafor,only:cutensor_c_8i
+use,intrinsic::cudafor,only:cutensor_r_8u
+use,intrinsic::cudafor,only:cutensor_c_8u
+use,intrinsic::cudafor,only:cutensor_r_16i
+use,intrinsic::cudafor,only:cutensor_c_16i
+use,intrinsic::cudafor,only:cutensor_r_16u
+use,intrinsic::cudafor,only:cutensor_c_16u
+use,intrinsic::cudafor,only:cutensor_r_32i
+use,intrinsic::cudafor,only:cutensor_c_32i
+use,intrinsic::cudafor,only:cutensor_r_32u
+use,intrinsic::cudafor,only:cutensor_c_32u
+use,intrinsic::cudafor,only:cutensor_r_64i
+use,intrinsic::cudafor,only:cutensor_c_64i
+use,intrinsic::cudafor,only:cutensor_r_64u
+use,intrinsic::cudafor,only:cutensor_c_64u
+use,intrinsic::cudafor,only:cutensorcomputetype
+use,intrinsic::cudafor,only:cutensor_compute_16f
+use,intrinsic::cudafor,only:cutensor_compute_16bf
+use,intrinsic::cudafor,only:cutensor_compute_tf32
+use,intrinsic::cudafor,only:cutensor_compute_3xtf32
+use,intrinsic::cudafor,only:cutensor_compute_32f
+use,intrinsic::cudafor,only:cutensor_compute_64f
+use,intrinsic::cudafor,only:cutensor_compute_8u
+use,intrinsic::cudafor,only:cutensor_compute_8i
+use,intrinsic::cudafor,only:cutensor_compute_32u
+use,intrinsic::cudafor,only:cutensor_compute_32i
+use,intrinsic::cudafor,only:cutensoroperationdescriptorattribute
+use,intrinsic::cudafor,only:cutensor_operation_descriptor_tag
+use,intrinsic::cudafor,only:cutensor_operation_descriptor_scalar_type
+use,intrinsic::cudafor,only:cutensor_operation_descriptor_flops
+use,intrinsic::cudafor,only:cutensor_operation_descriptor_moved_bytes
+use,intrinsic::cudafor,only:cutensor_operation_descriptor_padding_left
+use,intrinsic::cudafor,only:cutensor_operation_descriptor_padding_right
+use,intrinsic::cudafor,only:cutensor_operation_descriptor_padding_value
+use,intrinsic::cudafor,only:cutensorplanpreferenceattribute
+use,intrinsic::cudafor,only:cutensor_plan_preference_autotune_mode
+use,intrinsic::cudafor,only:cutensor_plan_preference_cache_mode
+use,intrinsic::cudafor,only:cutensor_plan_preference_incremental_count
+use,intrinsic::cudafor,only:cutensor_plan_preference_algo
+use,intrinsic::cudafor,only:cutensor_plan_preference_kernel_rank
+use,intrinsic::cudafor,only:cutensor_plan_preference_jit
+use,intrinsic::cudafor,only:cutensorplanattribute
+use,intrinsic::cudafor,only:cutensor_plan_required_workspace
+use,intrinsic::cudafor,only:cutensorautotunemode
+use,intrinsic::cudafor,only:cutensor_autotune_mode_none
+use,intrinsic::cudafor,only:cutensor_autotune_mode_incremental
+use,intrinsic::cudafor,only:cutensorjitmode
+use,intrinsic::cudafor,only:cutensor_jit_mode_none
+use,intrinsic::cudafor,only:cutensor_jit_mode_default
+use,intrinsic::cudafor,only:cutensorcachemode
+use,intrinsic::cudafor,only:cutensor_cache_mode_none
+use,intrinsic::cudafor,only:cutensor_cache_mode_pedantic
+use,intrinsic::cudafor,only:cutensorhandle
+use,intrinsic::cudafor,only:cutensortensordescriptor
+use,intrinsic::cudafor,only:cutensoroperationdescriptor
+use,intrinsic::cudafor,only:cutensorcomputedescriptor
+use,intrinsic::cudafor,only:cutensorplan
+use,intrinsic::cudafor,only:cutensorplanpreference
+use,intrinsic::cudafor,only:cutensor_major
+use,intrinsic::cudafor,only:cutensor_minor
+use,intrinsic::cudafor,only:cutensor_patch
+use,intrinsic::cudafor,only:cutensor_compute_desc_16f
+use,intrinsic::cudafor,only:cutensor_compute_desc_16bf
+use,intrinsic::cudafor,only:cutensor_compute_desc_tf32
+use,intrinsic::cudafor,only:cutensor_compute_desc_3xtf32
+use,intrinsic::cudafor,only:cutensor_compute_desc_32f
+use,intrinsic::cudafor,only:cutensor_compute_desc_64f
+use,intrinsic::cudafor,only:cutensorcreate
+use,intrinsic::cudafor,only:cutensordestroy
+use,intrinsic::cudafor,only:cutensorcreatetensordescriptor
+use,intrinsic::cudafor,only:cutensordestroytensordescriptor
+use,intrinsic::cudafor,only:cutensorcreateelementwisetrinary
+use,intrinsic::cudafor,only:cutensorelementwisetrinaryexecute
+use,intrinsic::cudafor,only:cutensorcreateelementwisebinary
+use,intrinsic::cudafor,only:cutensorelementwisebinaryexecute
+use,intrinsic::cudafor,only:cutensorcreatepermutation
+use,intrinsic::cudafor,only:cutensorpermute
+use,intrinsic::cudafor,only:cutensoroperationdescriptorsetattribute
+use,intrinsic::cudafor,only:cutensoroperationdescriptorgetattribute
+use,intrinsic::cudafor,only:cutensordestroyoperationdescriptor
+use,intrinsic::cudafor,only:cutensorcreateplanpreference
+use,intrinsic::cudafor,only:cutensordestroyplanpreference
+use,intrinsic::cudafor,only:cutensorplanpreferencesetattribute
+use,intrinsic::cudafor,only:cutensorplangetattribute
+use,intrinsic::cudafor,only:cutensorestimateworkspacesize
+use,intrinsic::cudafor,only:cutensorcreateplan
+use,intrinsic::cudafor,only:cutensordestroyplan
+use,intrinsic::cudafor,only:cutensorcreatecontraction
+use,intrinsic::cudafor,only:cutensorcontract
+use,intrinsic::cudafor,only:cutensorcreatereduction
+use,intrinsic::cudafor,only:cutensorreduce
+use,intrinsic::cudafor,only:cutensorgeterrorstring
+use,intrinsic::cudafor,only:cutensorgetversion
+use,intrinsic::cudafor,only:cutensorgetcudartversion
+use,intrinsic::cudafor,only:cutensorloggersetfile
+use,intrinsic::cudafor,only:cutensorloggeropenfile
+use,intrinsic::cudafor,only:cutensorloggersetlevel
+use,intrinsic::cudafor,only:cutensorloggersetmask
+use,intrinsic::cudafor,only:cutensorloggerforcedisable
+use,intrinsic::cudafor,only:__nvf_cutensorcompareeqstatus
+use,intrinsic::cudafor,only:__nvf_cutensorcompareeqoperator
+use,intrinsic::cudafor,only:__nvf_cutensorcompareeqdatatype
+use,intrinsic::cudafor,only:__nvf_cutensorcomparenestatus
+use,intrinsic::cudafor,only:__nvf_cutensorcompareneoperator
+use,intrinsic::cudafor,only:__nvf_cutensorcomparenedatatype
+use,intrinsic::cudafor,only:deferredarray
+use,intrinsic::cudafor,only:deferredarrayreshapeop
+use,intrinsic::cudafor,only:deferredarrayunaryop1d
+use,intrinsic::cudafor,only:deferredarrayunaryop2d
+use,intrinsic::cudafor,only:deferredarrayunaryop3d
+use,intrinsic::cudafor,only:deferredarrayreshapefuncop
+use,intrinsic::cudafor,only:deferredarraybinaryop1d
+use,intrinsic::cudafor,only:deferredarraybinaryop2d
+use,intrinsic::cudafor,only:deferredarraybinaryop3d
+use,intrinsic::cudafor,only:deferredarraybinaryopnd
+use,intrinsic::cudafor,only:deferredarraybinaryfuncop1d
+use,intrinsic::cudafor,only:deferredarraybinaryfuncop2d
+use,intrinsic::cudafor,only:deferredarraybinaryfuncop3d
+use,intrinsic::cudafor,only:deferredarraybinaryfuncopnd
+use,intrinsic::cudafor,only:deferredarraytrinaryop1d
+use,intrinsic::cudafor,only:deferredarraytrinaryop2d
+use,intrinsic::cudafor,only:deferredarraytrinaryop3d
+use,intrinsic::cudafor,only:deferredarraytrinaryopnd
+use,intrinsic::cudafor,only:deferredarraymatmulop
+use,intrinsic::cudafor,only:deferredarraymatmulaccop
+use,intrinsic::cudafor,only:deferredarraydotprop
+use,intrinsic::cudafor,only:deferredarraydotprdimop
+use,intrinsic::cudafor,only:deferredarraydotprdaccop
+use,intrinsic::cudafor,only:deferredarrayrelationalop1d
+use,intrinsic::cudafor,only:deferredarrayrelationalop2d
+use,intrinsic::cudafor,only:deferredarrayrelationalop3d
+use,intrinsic::cudafor,only:deferredarraybinaryop1drelationalop
+use,intrinsic::cudafor,only:deferredarraybinaryop2drelationalop
+use,intrinsic::cudafor,only:deferredarraybinaryop3drelationalop
+use,intrinsic::cudafor,only:deferredarraymergeop1d
+use,intrinsic::cudafor,only:deferredarraymergeop2d
+use,intrinsic::cudafor,only:deferredarraymergeop3d
+use,intrinsic::cudafor,only:deferredarraybimergeop1d
+use,intrinsic::cudafor,only:deferredarraybimergeop2d
+use,intrinsic::cudafor,only:deferredarraybimergeop3d
+use,intrinsic::cudafor,only:deferredarraypackop1d
+use,intrinsic::cudafor,only:deferredarraybipackop1d
+use,intrinsic::cudafor,only:deferredarraypackidxop1d
+use,intrinsic::cudafor,only:deferredarraypackidxop2d
+use,intrinsic::cudafor,only:deferredarraypackidxop3d
+use,intrinsic::cudafor,only:deferredarraybipackidxop1d
+use,intrinsic::cudafor,only:deferredarraybipackidxop2d
+use,intrinsic::cudafor,only:deferredarraybipackidxop3d
+use,intrinsic::cudafor,only:deferredarrayunpackop1d
+use,intrinsic::cudafor,only:deferredarrayunpackop2d
+use,intrinsic::cudafor,only:deferredarrayunpackop3d
+use,intrinsic::cudafor,only:deferredarraybiunpackop1d
+use,intrinsic::cudafor,only:deferredarraybiunpackop2d
+use,intrinsic::cudafor,only:deferredarraybiunpackop3d
+use,intrinsic::cudafor,only:deferredarraycountprefixop1d
+use,intrinsic::cudafor,only:deferredarraycountprefixop2d
+use,intrinsic::cudafor,only:deferredarraycountprefixop3d
+use,intrinsic::cudafor,only:deferredarraycountprefixbiop1d
+use,intrinsic::cudafor,only:deferredarraycountprefixbiop2d
+use,intrinsic::cudafor,only:deferredarraycountprefixbiop3d
+use,intrinsic::cudafor,only:deferredarraysumprefixnmop1d
+use,intrinsic::cudafor,only:deferredarraysumprefixnmop2d
+use,intrinsic::cudafor,only:deferredarraysumprefixnmop3d
+use,intrinsic::cudafor,only:deferredarraysumprefixop1d
+use,intrinsic::cudafor,only:deferredarraysumprefixop2d
+use,intrinsic::cudafor,only:deferredarraysumprefixop3d
+use,intrinsic::cudafor,only:deferredarraysumprefixbiop1d
+use,intrinsic::cudafor,only:deferredarraysumprefixbiop2d
+use,intrinsic::cudafor,only:deferredarraysumprefixbiop3d
+use,intrinsic::cudafor,only:deferredarraymaskreducop1d
+use,intrinsic::cudafor,only:deferredarraymaskreducop2d
+use,intrinsic::cudafor,only:deferredarraymaskreducop3d
+use,intrinsic::cudafor,only:deferredarraytensorreducop
+use,intrinsic::cudafor,only:deferredarraytensormaskreducop
+use,intrinsic::cudafor,only:deferredarraytensorreducaccop
+use,intrinsic::cudafor,only:deferredarrayintegerreducop
+use,intrinsic::cudafor,only:__pgi_sum2hwsi4_nd
+use,intrinsic::cudafor,only:__nvf_sum2hwsi4_nd_dim
+use,intrinsic::cudafor,only:__nvf_sum2hwsi4_nd_dim_msk
+use,intrinsic::cudafor,only:__pgi_sum2hwsi8_nd
+use,intrinsic::cudafor,only:__nvf_sum2hwsi8_nd_dim
+use,intrinsic::cudafor,only:__nvf_sum2hwsi8_nd_dim_msk
+use,intrinsic::cudafor,only:__pgi_sum2hwsr4_nd
+use,intrinsic::cudafor,only:__nvf_sum2hwsr4_nd_dim
+use,intrinsic::cudafor,only:__nvf_sum2hwsr4_nd_dim_msk
+use,intrinsic::cudafor,only:__pgi_sum2hwsr8_nd
+use,intrinsic::cudafor,only:__nvf_sum2hwsr8_nd_dim
+use,intrinsic::cudafor,only:__nvf_sum2hwsr8_nd_dim_msk
+use,intrinsic::cudafor,only:__pgi_mxv2hwsi4_nd
+use,intrinsic::cudafor,only:__nvf_mxv2hwsi4_nd_dim
+use,intrinsic::cudafor,only:__nvf_mxv2hwsi4_nd_dim_msk
+use,intrinsic::cudafor,only:__pgi_mxv2hwsi8_nd
+use,intrinsic::cudafor,only:__nvf_mxv2hwsi8_nd_dim
+use,intrinsic::cudafor,only:__nvf_mxv2hwsi8_nd_dim_msk
+use,intrinsic::cudafor,only:__pgi_mxv2hwsr4_nd
+use,intrinsic::cudafor,only:__nvf_mxv2hwsr4_nd_dim
+use,intrinsic::cudafor,only:__nvf_mxv2hwsr4_nd_dim_msk
+use,intrinsic::cudafor,only:__pgi_mxv2hwsr8_nd
+use,intrinsic::cudafor,only:__nvf_mxv2hwsr8_nd_dim
+use,intrinsic::cudafor,only:__nvf_mxv2hwsr8_nd_dim_msk
+use,intrinsic::cudafor,only:__pgi_mnv2hwsi4_nd
+use,intrinsic::cudafor,only:__nvf_mnv2hwsi4_nd_dim
+use,intrinsic::cudafor,only:__nvf_mnv2hwsi4_nd_dim_msk
+use,intrinsic::cudafor,only:__pgi_mnv2hwsi8_nd
+use,intrinsic::cudafor,only:__nvf_mnv2hwsi8_nd_dim
+use,intrinsic::cudafor,only:__nvf_mnv2hwsi8_nd_dim_msk
+use,intrinsic::cudafor,only:__pgi_mnv2hwsr4_nd
+use,intrinsic::cudafor,only:__nvf_mnv2hwsr4_nd_dim
+use,intrinsic::cudafor,only:__nvf_mnv2hwsr4_nd_dim_msk
+use,intrinsic::cudafor,only:__pgi_mnv2hwsr8_nd
+use,intrinsic::cudafor,only:__nvf_mnv2hwsr8_nd_dim
+use,intrinsic::cudafor,only:__nvf_mnv2hwsr8_nd_dim_msk
+use,intrinsic::cudafor,only:__nvf_scale_i4_reduc1
+use,intrinsic::cudafor,only:__nvf_scale_i4_reduc2
+use,intrinsic::cudafor,only:__nvf_scale_i8_reduc1
+use,intrinsic::cudafor,only:__nvf_scale_i8_reduc2
+use,intrinsic::cudafor,only:__nvf_scale_r4_reduc1
+use,intrinsic::cudafor,only:__nvf_scale_r4_reduc2
+use,intrinsic::cudafor,only:__nvf_scale_r8_reduc1
+use,intrinsic::cudafor,only:__nvf_scale_r8_reduc2
+use,intrinsic::cudafor,only:__nvf_defer_assigni4_reduc
+use,intrinsic::cudafor,only:__nvf_defer_assigni8_reduc
+use,intrinsic::cudafor,only:__nvf_defer_assignr4_reduc
+use,intrinsic::cudafor,only:__nvf_defer_assignr8_reduc
+use,intrinsic::cudafor,only:__nvf_defer_assigni4_maskreduc
+use,intrinsic::cudafor,only:__nvf_defer_assigni8_maskreduc
+use,intrinsic::cudafor,only:__nvf_defer_assignr4_maskreduc
+use,intrinsic::cudafor,only:__nvf_defer_assignr8_maskreduc
+use,intrinsic::cudafor,only:__nvf_maxlocr_32f1d
+use,intrinsic::cudafor,only:__nvf_maxlocr_32f2d
+use,intrinsic::cudafor,only:__nvf_maxlocr_32f3d
+use,intrinsic::cudafor,only:__nvf_maxlocr_32f4d
+use,intrinsic::cudafor,only:__nvf_maxlocr_32f5d
+use,intrinsic::cudafor,only:__nvf_maxlocr_32f6d
+use,intrinsic::cudafor,only:__nvf_maxlocr_32f7d
+use,intrinsic::cudafor,only:__nvf_maxlocr_64f1d
+use,intrinsic::cudafor,only:__nvf_maxlocr_64f2d
+use,intrinsic::cudafor,only:__nvf_maxlocr_64f3d
+use,intrinsic::cudafor,only:__nvf_maxlocr_64f4d
+use,intrinsic::cudafor,only:__nvf_maxlocr_64f5d
+use,intrinsic::cudafor,only:__nvf_maxlocr_64f6d
+use,intrinsic::cudafor,only:__nvf_maxlocr_64f7d
+use,intrinsic::cudafor,only:__nvf_maxlocr_32i1d
+use,intrinsic::cudafor,only:__nvf_maxlocr_32i2d
+use,intrinsic::cudafor,only:__nvf_maxlocr_32i3d
+use,intrinsic::cudafor,only:__nvf_maxlocr_32i4d
+use,intrinsic::cudafor,only:__nvf_maxlocr_32i5d
+use,intrinsic::cudafor,only:__nvf_maxlocr_32i6d
+use,intrinsic::cudafor,only:__nvf_maxlocr_32i7d
+use,intrinsic::cudafor,only:__nvf_maxlocr_64i1d
+use,intrinsic::cudafor,only:__nvf_maxlocr_64i2d
+use,intrinsic::cudafor,only:__nvf_maxlocr_64i3d
+use,intrinsic::cudafor,only:__nvf_maxlocr_64i4d
+use,intrinsic::cudafor,only:__nvf_maxlocr_64i5d
+use,intrinsic::cudafor,only:__nvf_maxlocr_64i6d
+use,intrinsic::cudafor,only:__nvf_maxlocr_64i7d
+use,intrinsic::cudafor,only:__nvf_maxlocdimr_32f1d
+use,intrinsic::cudafor,only:__nvf_maxlocdimr_64f1d
+use,intrinsic::cudafor,only:__nvf_maxlocdimr_32i1d
+use,intrinsic::cudafor,only:__nvf_maxlocdimr_64i1d
+use,intrinsic::cudafor,only:__nvf_minlocr_32f1d
+use,intrinsic::cudafor,only:__nvf_minlocr_32f2d
+use,intrinsic::cudafor,only:__nvf_minlocr_32f3d
+use,intrinsic::cudafor,only:__nvf_minlocr_32f4d
+use,intrinsic::cudafor,only:__nvf_minlocr_32f5d
+use,intrinsic::cudafor,only:__nvf_minlocr_32f6d
+use,intrinsic::cudafor,only:__nvf_minlocr_32f7d
+use,intrinsic::cudafor,only:__nvf_minlocr_64f1d
+use,intrinsic::cudafor,only:__nvf_minlocr_64f2d
+use,intrinsic::cudafor,only:__nvf_minlocr_64f3d
+use,intrinsic::cudafor,only:__nvf_minlocr_64f4d
+use,intrinsic::cudafor,only:__nvf_minlocr_64f5d
+use,intrinsic::cudafor,only:__nvf_minlocr_64f6d
+use,intrinsic::cudafor,only:__nvf_minlocr_64f7d
+use,intrinsic::cudafor,only:__nvf_minlocr_32i1d
+use,intrinsic::cudafor,only:__nvf_minlocr_32i2d
+use,intrinsic::cudafor,only:__nvf_minlocr_32i3d
+use,intrinsic::cudafor,only:__nvf_minlocr_32i4d
+use,intrinsic::cudafor,only:__nvf_minlocr_32i5d
+use,intrinsic::cudafor,only:__nvf_minlocr_32i6d
+use,intrinsic::cudafor,only:__nvf_minlocr_32i7d
+use,intrinsic::cudafor,only:__nvf_minlocr_64i1d
+use,intrinsic::cudafor,only:__nvf_minlocr_64i2d
+use,intrinsic::cudafor,only:__nvf_minlocr_64i3d
+use,intrinsic::cudafor,only:__nvf_minlocr_64i4d
+use,intrinsic::cudafor,only:__nvf_minlocr_64i5d
+use,intrinsic::cudafor,only:__nvf_minlocr_64i6d
+use,intrinsic::cudafor,only:__nvf_minlocr_64i7d
+use,intrinsic::cudafor,only:__nvf_minlocdimr_32f1d
+use,intrinsic::cudafor,only:__nvf_minlocdimr_64f1d
+use,intrinsic::cudafor,only:__nvf_minlocdimr_32i1d
+use,intrinsic::cudafor,only:__nvf_minlocdimr_64i1d
+use,intrinsic::cudafor,only:sum
+use,intrinsic::cudafor,only:maxval
+use,intrinsic::cudafor,only:minval
+use,intrinsic::cudafor,only:operator(*)
+use,intrinsic::cudafor,only:maxloc
+use,intrinsic::cudafor,only:minloc
+use,intrinsic::cudafor,only:sort_int4
+use,intrinsic::cudafor,only:sort_int8
+use,intrinsic::cudafor,only:sort_real4
+use,intrinsic::cudafor,only:sort_real8
+use,intrinsic::cudafor,only:sort_dev_int4
+use,intrinsic::cudafor,only:sort_dev_int8
+use,intrinsic::cudafor,only:sort_dev_real4
+use,intrinsic::cudafor,only:sort_dev_real8
+use,intrinsic::cudafor,only:sort_bk_int4
+use,intrinsic::cudafor,only:sort_bk_int8
+use,intrinsic::cudafor,only:sort_bk_real4
+use,intrinsic::cudafor,only:sort_bk_real8
+use,intrinsic::cudafor,only:sort_dev_bk_int4
+use,intrinsic::cudafor,only:sort_dev_bk_int8
+use,intrinsic::cudafor,only:sort_dev_bk_real4
+use,intrinsic::cudafor,only:sort_dev_bk_real8
+use,intrinsic::cudafor,only:__nvf_sort_host_i4_noidx
+use,intrinsic::cudafor,only:__nvf_sort_host_i8_noidx
+use,intrinsic::cudafor,only:__nvf_sort_host_r4_noidx
+use,intrinsic::cudafor,only:__nvf_sort_host_r8_noidx
+use,intrinsic::cudafor,only:__nvf_sort_host_i4_idx
+use,intrinsic::cudafor,only:__nvf_sort_host_i8_idx
+use,intrinsic::cudafor,only:__nvf_sort_host_r4_idx
+use,intrinsic::cudafor,only:__nvf_sort_host_r8_idx
+use,intrinsic::cudafor,only:__nvf_sort_dev_i4_noidx
+use,intrinsic::cudafor,only:__nvf_sort_dev_i8_noidx
+use,intrinsic::cudafor,only:__nvf_sort_dev_r4_noidx
+use,intrinsic::cudafor,only:__nvf_sort_dev_r8_noidx
+use,intrinsic::cudafor,only:__nvf_sort_dev_i4_idx
+use,intrinsic::cudafor,only:__nvf_sort_dev_i8_idx
+use,intrinsic::cudafor,only:__nvf_sort_dev_r4_idx
+use,intrinsic::cudafor,only:__nvf_sort_dev_r8_idx
+use,intrinsic::cudafor,only:csort
+use,intrinsic::cudafor,only:fsort
+use,intrinsic::cudafor,only:assignment(=)
+use,intrinsic::cudafor,only:operator(.eq.)
+use,intrinsic::__fortran_builtins,only:operator(.eq.)
+use,intrinsic::cudafor,only:operator(.ne.)
+use,intrinsic::__fortran_builtins,only:operator(.ne.)
+type::simple_type
+real(4)::val
+end type
+type::container
+type(c_devptr)::ptr
+end type
+interface operator(.eq.)
+end interface
+interface operator(.ne.)
+end interface
+contains
+attributes(device) subroutine test(rec)
+type(container),device::rec
+end
+end



More information about the flang-commits mailing list