[Mlir-commits] [mlir] [mlir][OpenMP] cast address space of private variables (PR #130301)
Tom Eccles
llvmlistbot at llvm.org
Fri Mar 7 08:16:56 PST 2025
https://github.com/tblah created https://github.com/llvm/llvm-project/pull/130301
Fixes #130159
The problem is that the alloca created for the private variable uses the default alloca address space in that module, but the function the pointer is being passed to expects a different address space, leading to a type mismatch in the function argument.
I know nothing about how AMDGPU is supposed to work. I based this solution on code from createDeviceArgumentAccessor(). Please could somebody from AMD confirm this solution is appropriate.
>From 08185bdc9cd34b24ab35a2775048a162fa2fe55e Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles at arm.com>
Date: Fri, 7 Mar 2025 15:46:35 +0000
Subject: [PATCH] [mlir][OpenMP] cast address space of private variables
Fixes #130159
The problem is that the alloca created for the private variable uses the
default alloca address space in that module, but the function the
pointer is being passed to expects a different address space, leading to
a type mismatch in the function argument.
I know nothing about how AMDGPU is supposed to work. I based this
solution on code from createDeviceArgumentAccessor(). Please could
somebody from AMD confirm this solution is appropriate.
---
.../OpenMP/OpenMPToLLVMIRTranslation.cpp | 10 ++++
.../Target/LLVMIR/omptarget-private-llvm.mlir | 46 +++++++++++++++++++
2 files changed, 56 insertions(+)
create mode 100644 mlir/test/Target/LLVMIR/omptarget-private-llvm.mlir
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 32c7c501d03c3..842308807cf02 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -1452,6 +1452,12 @@ allocatePrivateVars(llvm::IRBuilderBase &builder,
llvm::BasicBlock *afterAllocas = allocaTerminator->getSuccessor(0);
+ unsigned int allocaAS =
+ moduleTranslation.getLLVMModule()->getDataLayout().getAllocaAddrSpace();
+ unsigned int defaultAS = moduleTranslation.getLLVMModule()
+ ->getDataLayout()
+ .getProgramAddressSpace();
+
for (auto [privDecl, mlirPrivVar, blockArg] :
llvm::zip_equal(privateDecls, mlirPrivateVars, privateBlockArgs)) {
llvm::Type *llvmAllocType =
@@ -1459,6 +1465,10 @@ allocatePrivateVars(llvm::IRBuilderBase &builder,
builder.SetInsertPoint(allocaIP.getBlock()->getTerminator());
llvm::Value *llvmPrivateVar = builder.CreateAlloca(
llvmAllocType, /*ArraySize=*/nullptr, "omp.private.alloc");
+ if (allocaAS != defaultAS)
+ llvmPrivateVar = builder.CreateAddrSpaceCast(llvmPrivateVar,
+ builder.getPtrTy(defaultAS));
+
llvmPrivateVars.push_back(llvmPrivateVar);
}
diff --git a/mlir/test/Target/LLVMIR/omptarget-private-llvm.mlir b/mlir/test/Target/LLVMIR/omptarget-private-llvm.mlir
new file mode 100644
index 0000000000000..a2500f3a579dd
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/omptarget-private-llvm.mlir
@@ -0,0 +1,46 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+// Regression test for calling a function using pointer alloca'ed on
+// device for private variable
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memory_space", 5 : ui32>>, llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9", llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true} {
+ omp.private {type = private} @_QMmodFfailingEi_private_i32 : i32
+ llvm.func @_QMotherProutine(%arg0: !llvm.ptr {fir.bindc_name = "i", llvm.nocapture}) attributes {frame_pointer = #llvm.framePointerKind<all>, omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (to)>, target_cpu = "gfx90a", target_features = #llvm.target_features<["+16-bit-insts", "+atomic-buffer-global-pk-add-f16-insts", "+atomic-fadd-rtn-insts", "+ci-insts", "+dl-insts", "+dot1-insts", "+dot10-insts", "+dot2-insts", "+dot3-insts", "+dot4-insts", "+dot5-insts", "+dot6-insts", "+dot7-insts", "+dpp", "+gfx8-insts", "+gfx9-insts", "+gfx90a-insts", "+gws", "+image-insts", "+mai-insts", "+s-memrealtime", "+s-memtime-inst", "+wavefrontsize64"]>} {
+ llvm.return
+ }
+ llvm.func @_QMmodPfailing(%arg0: !llvm.ptr {fir.bindc_name = "d", llvm.nocapture}) attributes {frame_pointer = #llvm.framePointerKind<all>, omp.declare_target = #omp.declaretarget<device_type = (host), capture_clause = (to)>, target_cpu = "gfx90a", target_features = #llvm.target_features<["+16-bit-insts", "+atomic-buffer-global-pk-add-f16-insts", "+atomic-fadd-rtn-insts", "+ci-insts", "+dl-insts", "+dot1-insts", "+dot10-insts", "+dot2-insts", "+dot3-insts", "+dot4-insts", "+dot5-insts", "+dot6-insts", "+dot7-insts", "+dpp", "+gfx8-insts", "+gfx9-insts", "+gfx90a-insts", "+gws", "+image-insts", "+mai-insts", "+s-memrealtime", "+s-memtime-inst", "+wavefrontsize64"]>} {
+ %0 = llvm.mlir.constant(1 : i64) : i64
+ %1 = llvm.alloca %0 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr<5>
+ %2 = llvm.addrspacecast %1 : !llvm.ptr<5> to !llvm.ptr
+ %3 = llvm.mlir.constant(1 : i64) : i64
+ %4 = omp.map.info var_ptr(%2 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "i"}
+ %5 = omp.map.info var_ptr(%arg0 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "d"}
+ omp.target map_entries(%4 -> %arg1, %5 -> %arg2 : !llvm.ptr, !llvm.ptr) {
+ %6 = llvm.mlir.constant(1 : i32) : i32
+ omp.teams {
+
+// CHECK: omp.par.entry:
+// CHECK: %[[TID_ADDR_LOCAL:.*]] = alloca i32, align 4, addrspace(5)
+// CHECK: %[[OMP_PRIVATE_ALLOC:omp\.private\.alloc]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: %[[CAST:.*]] = addrspacecast ptr addrspace(5) %[[OMP_PRIVATE_ALLOC]] to ptr
+
+ omp.parallel private(@_QMmodFfailingEi_private_i32 %arg1 -> %arg3 : !llvm.ptr) {
+ %7 = llvm.load %arg2 : !llvm.ptr -> i32
+ omp.distribute {
+ omp.wsloop {
+ omp.loop_nest (%arg4) : i32 = (%6) to (%7) inclusive step (%6) {
+ llvm.store %arg4, %arg3 : i32, !llvm.ptr
+ llvm.call @_QMotherProutine(%arg3) {fastmathFlags = #llvm.fastmath<contract>} : (!llvm.ptr) -> ()
+ omp.yield
+ }
+ } {omp.composite}
+ } {omp.composite}
+ omp.terminator
+ } {omp.composite}
+ omp.terminator
+ }
+ omp.terminator
+ }
+ llvm.return
+ }
+}
More information about the Mlir-commits
mailing list