[flang-commits] [flang] 829e899 - [flang][cuda] Lower __LDCA, __LDCS, __LDLU, __LDCV, __LDCG with arrays (#130357)
via flang-commits
flang-commits at lists.llvm.org
Fri Mar 7 15:35:54 PST 2025
Author: Valentin Clement (バレンタイン クレメン)
Date: 2025-03-07T15:35:52-08:00
New Revision: 829e8993e5998f69fe04d23a815aeab51de9a4c5
URL: https://github.com/llvm/llvm-project/commit/829e8993e5998f69fe04d23a815aeab51de9a4c5
DIFF: https://github.com/llvm/llvm-project/commit/829e8993e5998f69fe04d23a815aeab51de9a4c5.diff
LOG: [flang][cuda] Lower __LDCA, __LDCS, __LDLU, __LDCV, __LDCG with arrays (#130357)
Added:
Modified:
flang/include/flang/Optimizer/Builder/IntrinsicCall.h
flang/lib/Optimizer/Builder/IntrinsicCall.cpp
flang/test/Lower/CUDA/cuda-device-proc.cuf
Removed:
################################################################################
diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
index 778fcf1d93d8b..c3685556f77b3 100644
--- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
@@ -224,6 +224,9 @@ struct IntrinsicLibrary {
fir::ExtendedValue genCount(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
void genCpuTime(llvm::ArrayRef<fir::ExtendedValue>);
fir::ExtendedValue genCshift(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
+ template <const char *fctName, int extent>
+ fir::ExtendedValue genCUDALDXXFunc(mlir::Type,
+ llvm::ArrayRef<fir::ExtendedValue>);
fir::ExtendedValue genCAssociatedCFunPtr(mlir::Type,
llvm::ArrayRef<fir::ExtendedValue>);
fir::ExtendedValue genCAssociatedCPtr(mlir::Type,
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index 0a70ed3c6bdf1..04059f3943514 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -106,6 +106,34 @@ using I = IntrinsicLibrary;
/// argument is an optional variable in the current scope).
static constexpr bool handleDynamicOptional = true;
+/// TODO: Move all CUDA Fortran intrinsic handlers into its own file similar to
+/// PPC.
+static const char __ldca_i4x4[] = "__ldca_i4x4_";
+static const char __ldca_i8x2[] = "__ldca_i8x2_";
+static const char __ldca_r2x2[] = "__ldca_r2x2_";
+static const char __ldca_r4x4[] = "__ldca_r4x4_";
+static const char __ldca_r8x2[] = "__ldca_r8x2_";
+static const char __ldcg_i4x4[] = "__ldcg_i4x4_";
+static const char __ldcg_i8x2[] = "__ldcg_i8x2_";
+static const char __ldcg_r2x2[] = "__ldcg_r2x2_";
+static const char __ldcg_r4x4[] = "__ldcg_r4x4_";
+static const char __ldcg_r8x2[] = "__ldcg_r8x2_";
+static const char __ldcs_i4x4[] = "__ldcs_i4x4_";
+static const char __ldcs_i8x2[] = "__ldcs_i8x2_";
+static const char __ldcs_r2x2[] = "__ldcs_r2x2_";
+static const char __ldcs_r4x4[] = "__ldcs_r4x4_";
+static const char __ldcs_r8x2[] = "__ldcs_r8x2_";
+static const char __ldcv_i4x4[] = "__ldcv_i4x4_";
+static const char __ldcv_i8x2[] = "__ldcv_i8x2_";
+static const char __ldcv_r2x2[] = "__ldcv_r2x2_";
+static const char __ldcv_r4x4[] = "__ldcv_r4x4_";
+static const char __ldcv_r8x2[] = "__ldcv_r8x2_";
+static const char __ldlu_i4x4[] = "__ldlu_i4x4_";
+static const char __ldlu_i8x2[] = "__ldlu_i8x2_";
+static const char __ldlu_r2x2[] = "__ldlu_r2x2_";
+static const char __ldlu_r4x4[] = "__ldlu_r4x4_";
+static const char __ldlu_r8x2[] = "__ldlu_r8x2_";
+
/// Table that drives the fir generation depending on the intrinsic or intrinsic
/// module procedure one to one mapping with Fortran arguments. If no mapping is
/// defined here for a generic intrinsic, genRuntimeCall will be called
@@ -114,6 +142,106 @@ static constexpr bool handleDynamicOptional = true;
/// argument must not be lowered by value. In which case, the lowering rules
/// should be provided for all the intrinsic arguments for completeness.
static constexpr IntrinsicHandler handlers[]{
+ {"__ldca_i4x4",
+ &I::genCUDALDXXFunc<__ldca_i4x4, 4>,
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldca_i8x2",
+ &I::genCUDALDXXFunc<__ldca_i8x2, 2>,
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldca_r2x2",
+ &I::genCUDALDXXFunc<__ldca_r2x2, 2>,
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldca_r4x4",
+ &I::genCUDALDXXFunc<__ldca_r4x4, 4>,
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldca_r8x2",
+ &I::genCUDALDXXFunc<__ldca_r8x2, 2>,
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldcg_i4x4",
+ &I::genCUDALDXXFunc<__ldcg_i4x4, 4>,
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldcg_i8x2",
+ &I::genCUDALDXXFunc<__ldcg_i8x2, 2>,
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldcg_r2x2",
+ &I::genCUDALDXXFunc<__ldcg_r2x2, 2>,
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldcg_r4x4",
+ &I::genCUDALDXXFunc<__ldcg_r4x4, 4>,
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldcg_r8x2",
+ &I::genCUDALDXXFunc<__ldcg_r8x2, 2>,
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldcs_i4x4",
+ &I::genCUDALDXXFunc<__ldcs_i4x4, 4>,
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldcs_i8x2",
+ &I::genCUDALDXXFunc<__ldcs_i8x2, 2>,
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldcs_r2x2",
+ &I::genCUDALDXXFunc<__ldcs_r2x2, 2>,
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldcs_r4x4",
+ &I::genCUDALDXXFunc<__ldcs_r4x4, 4>,
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldcs_r8x2",
+ &I::genCUDALDXXFunc<__ldcs_r8x2, 2>,
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldcv_i4x4",
+ &I::genCUDALDXXFunc<__ldcv_i4x4, 4>,
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldcv_i8x2",
+ &I::genCUDALDXXFunc<__ldcv_i8x2, 2>,
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldcv_r2x2",
+ &I::genCUDALDXXFunc<__ldcv_r2x2, 2>,
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldcv_r4x4",
+ &I::genCUDALDXXFunc<__ldcv_r4x4, 4>,
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldcv_r8x2",
+ &I::genCUDALDXXFunc<__ldcv_r8x2, 2>,
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldlu_i4x4",
+ &I::genCUDALDXXFunc<__ldlu_i4x4, 4>,
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldlu_i8x2",
+ &I::genCUDALDXXFunc<__ldlu_i8x2, 2>,
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldlu_r2x2",
+ &I::genCUDALDXXFunc<__ldlu_r2x2, 2>,
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldlu_r4x4",
+ &I::genCUDALDXXFunc<__ldlu_r4x4, 4>,
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldlu_r8x2",
+ &I::genCUDALDXXFunc<__ldlu_r8x2, 2>,
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
{"abort", &I::genAbort},
{"abs", &I::genAbs},
{"achar", &I::genChar},
@@ -3544,6 +3672,29 @@ IntrinsicLibrary::genCshift(mlir::Type resultType,
return readAndAddCleanUp(resultMutableBox, resultType, "CSHIFT");
}
+// __LDCA, __LDCS, __LDLU, __LDCV
+template <const char *fctName, int extent>
+fir::ExtendedValue
+IntrinsicLibrary::genCUDALDXXFunc(mlir::Type resultType,
+ llvm::ArrayRef<fir::ExtendedValue> args) {
+ assert(args.size() == 1);
+ mlir::Type resTy = fir::SequenceType::get(extent, resultType);
+ mlir::Value arg = fir::getBase(args[0]);
+ mlir::Value res = builder.create<fir::AllocaOp>(loc, resTy);
+ if (mlir::isa<fir::BaseBoxType>(arg.getType()))
+ arg = builder.create<fir::BoxAddrOp>(loc, arg);
+ mlir::FunctionType ftype =
+ mlir::FunctionType::get(arg.getContext(), {resTy, resTy}, {});
+ auto funcOp = builder.createFunction(loc, fctName, ftype);
+ llvm::SmallVector<mlir::Value> funcArgs;
+ funcArgs.push_back(res);
+ funcArgs.push_back(arg);
+ builder.create<fir::CallOp>(loc, funcOp, funcArgs);
+ mlir::Value ext =
+ builder.createIntegerConstant(loc, builder.getIndexType(), extent);
+ return fir::ArrayBoxValue(res, {ext});
+}
+
// DATE_AND_TIME
void IntrinsicLibrary::genDateAndTime(llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 4 && "date_and_time has 4 args");
diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf
index 5c7f334bae8ba..2cdaaff6a33ea 100644
--- a/flang/test/Lower/CUDA/cuda-device-proc.cuf
+++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf
@@ -217,10 +217,11 @@ attributes(global) subroutine __ldXXi4(b)
end
! CHECK-LABEL: func.func @_QP__ldxxi4
-! CHECK: __ldca_i4x4
-! CHECK: __ldcg_i4x4
-! CHECK: __ldcs_i4x4
-! CHECK: __ldlu_i4x4
+! CHECK: fir.call @__ldca_i4x4_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<4xi32>>, !fir.ref<!fir.array<?xi32>>) -> ()
+! CHECK: fir.call @__ldcg_i4x4_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<4xi32>>, !fir.ref<!fir.array<?xi32>>) -> ()
+! CHECK: fir.call @__ldcs_i4x4_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<4xi32>>, !fir.ref<!fir.array<?xi32>>) -> ()
+! CHECK: fir.call @__ldlu_i4x4_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<4xi32>>, !fir.ref<!fir.array<?xi32>>) -> ()
+! CHECK: fir.call @__ldcv_i4x4_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<4xi32>>, !fir.ref<!fir.array<?xi32>>) -> ()
attributes(global) subroutine __ldXXi8(b)
integer(8), device :: b(*)
@@ -233,10 +234,11 @@ attributes(global) subroutine __ldXXi8(b)
end
! CHECK-LABEL: func.func @_QP__ldxxi8
-! CHECK: __ldca_i8x2
-! CHECK: __ldcg_i8x2
-! CHECK: __ldcs_i8x2
-! CHECK: __ldlu_i8x2
+! CHECK: fir.call @__ldca_i8x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xi64>>, !fir.ref<!fir.array<?xi64>>) -> ()
+! CHECK: fir.call @__ldcg_i8x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xi64>>, !fir.ref<!fir.array<?xi64>>) -> ()
+! CHECK: fir.call @__ldcs_i8x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xi64>>, !fir.ref<!fir.array<?xi64>>) -> ()
+! CHECK: fir.call @__ldlu_i8x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xi64>>, !fir.ref<!fir.array<?xi64>>) -> ()
+! CHECK: fir.call @__ldcv_i8x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xi64>>, !fir.ref<!fir.array<?xi64>>) -> ()
attributes(global) subroutine __ldXXr4(b)
real, device :: b(*)
@@ -249,10 +251,11 @@ attributes(global) subroutine __ldXXr4(b)
end
! CHECK-LABEL: func.func @_QP__ldxxr4
-! CHECK: __ldca_r4x4
-! CHECK: __ldcg_r4x4
-! CHECK: __ldcs_r4x4
-! CHECK: __ldlu_r4x4
+! CHECK: fir.call @__ldca_r4x4_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<4xf32>>, !fir.ref<!fir.array<?xf32>>) -> ()
+! CHECK: fir.call @__ldcg_r4x4_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<4xf32>>, !fir.ref<!fir.array<?xf32>>) -> ()
+! CHECK: fir.call @__ldcs_r4x4_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<4xf32>>, !fir.ref<!fir.array<?xf32>>) -> ()
+! CHECK: fir.call @__ldlu_r4x4_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<4xf32>>, !fir.ref<!fir.array<?xf32>>) -> ()
+! CHECK: fir.call @__ldcv_r4x4_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<4xf32>>, !fir.ref<!fir.array<?xf32>>) -> ()
attributes(global) subroutine __ldXXr2(b)
real(2), device :: b(*)
@@ -265,10 +268,11 @@ attributes(global) subroutine __ldXXr2(b)
end
! CHECK-LABEL: func.func @_QP__ldxxr2
-! CHECK: __ldca_r2x2
-! CHECK: __ldcg_r2x2
-! CHECK: __ldcs_r2x2
-! CHECK: __ldlu_r2x2
+! CHECK: fir.call @__ldca_r2x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xf16>>, !fir.ref<!fir.array<?xf16>>) -> ()
+! CHECK: fir.call @__ldcg_r2x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xf16>>, !fir.ref<!fir.array<?xf16>>) -> ()
+! CHECK: fir.call @__ldcs_r2x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xf16>>, !fir.ref<!fir.array<?xf16>>) -> ()
+! CHECK: fir.call @__ldlu_r2x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xf16>>, !fir.ref<!fir.array<?xf16>>) -> ()
+! CHECK: fir.call @__ldcv_r2x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xf16>>, !fir.ref<!fir.array<?xf16>>) -> ()
attributes(global) subroutine __ldXXr8(b)
real(8), device :: b(*)
@@ -281,7 +285,8 @@ attributes(global) subroutine __ldXXr8(b)
end
! CHECK-LABEL: func.func @_QP__ldxxr8
-! CHECK: __ldca_r8x2
-! CHECK: __ldcg_r8x2
-! CHECK: __ldcs_r8x2
-! CHECK: __ldlu_r8x2
+! CHECK: fir.call @__ldca_r8x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xf64>>, !fir.ref<!fir.array<?xf64>>) -> ()
+! CHECK: fir.call @__ldcg_r8x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xf64>>, !fir.ref<!fir.array<?xf64>>) -> ()
+! CHECK: fir.call @__ldcs_r8x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xf64>>, !fir.ref<!fir.array<?xf64>>) -> ()
+! CHECK: fir.call @__ldlu_r8x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xf64>>, !fir.ref<!fir.array<?xf64>>) -> ()
+! CHECK: fir.call @__ldcv_r8x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xf64>>, !fir.ref<!fir.array<?xf64>>) -> ()
More information about the flang-commits
mailing list