[Mlir-commits] [mlir] [mlir][ROCDL] Set the LLVM data layout when lowering to ROCDL LLVM (PR #74501)

Tue Dec 5 09:37:15 PST 2023

https://github.com/krzysz00 created https://github.com/llvm/llvm-project/pull/74501

In order to ensure operations lower correctly (especially memref.addrspacecast, which relies on the data layout being set correctly when dealing with dynamic memrefs) and to prevent compilation issues later down the line, set the `llvm.data_layout` attribute on GPU modules when lowering their contents to a ROCDL / AMDGPU target.

If there's a good way to test the embedded string to prevent it from going out of sync with the LLVM TargetMachine, I'd appreciate hearing about it. (Or, alternatively, if there's a place I could factor the string out to).

>From 8266d14252d045b343903c5d466dc9c6cc0299d0 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Thu, 5 Oct 2023 21:30:06 +0000
Subject: [PATCH] [mlir][ROCDL] Set the LLVM data layout when lowering to ROCDL
 LLVM

In order to ensure operations lower correctly (especially
memref.addrspacecast, which relies on the data layout being set
correctly when dealing with dynamic memrefs) and to prevent
compilation issues later down the line, set the `llvm.data_layout`
attribute on GPU modules when lowering their contents to a ROCDL /
AMDGPU target.

If there's a good way to test the embedded string to prevent it from
going out of sync with the LLVM TargetMachine, I'd appreciate hearing
about it. (Or, alternatively, if there's a place I could factor the
string out to).
---
 .../Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp  | 12 ++++++++++++
 mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir    | 10 ++++++++++
 2 files changed, 22 insertions(+)

diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index d9f94e30b04c6..c93b33e761dd9 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -75,6 +75,11 @@ Value getLaneId(ConversionPatternRewriter &rewriter, Location loc,
                                                    ValueRange{minus1, mbcntLo});
   return laneId;
 }
+static constexpr StringLiteral amdgcnDataLayout =
+    "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
+    "-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:"
+    "128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-"
+    "G1-ni:7:8";
 
 namespace {
 struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
@@ -212,6 +217,12 @@ struct LowerGpuOpsToROCDLOpsPass
     gpu::GPUModuleOp m = getOperation();
     MLIRContext *ctx = m.getContext();
 
+    auto llvmDataLayout = m->getAttrOfType<StringAttr>(
+        LLVM::LLVMDialect::getDataLayoutAttrName());
+    if (!llvmDataLayout) {
+      llvmDataLayout = StringAttr::get(ctx, amdgcnDataLayout);
+      m->setAttr(LLVM::LLVMDialect::getDataLayoutAttrName(), llvmDataLayout);
+    }
     // Request C wrapper emission.
     for (auto func : m.getOps<func::FuncOp>()) {
       func->setAttr(LLVM::LLVMDialect::getEmitCWrapperAttrName(),
@@ -227,6 +238,7 @@ struct LowerGpuOpsToROCDLOpsPass
     /// Customize the bitwidth used for the device side index computations.
     LowerToLLVMOptions options(
         ctx, DataLayout(cast<DataLayoutOpInterface>(m.getOperation())));
+    options.dataLayout = llvm::DataLayout(llvmDataLayout.getValue());
     if (indexBitwidth != kDeriveIndexBitwidthFromDataLayout)
       options.overrideIndexBitwidth(indexBitwidth);
 
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
index 2652b86657099..8a2d8bd7967ca 100644
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -1,6 +1,8 @@
 // RUN: mlir-opt %s -convert-gpu-to-rocdl -split-input-file | FileCheck %s
 // RUN: mlir-opt %s -convert-gpu-to-rocdl='index-bitwidth=32' -split-input-file | FileCheck --check-prefix=CHECK32 %s
 
+// CHECK-LABEL: @test_module
+// CHECK-SAME: llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8"
 gpu.module @test_module {
   // CHECK-LABEL: func @gpu_index_ops()
   // CHECK32-LABEL: func @gpu_index_ops()
@@ -628,3 +630,11 @@ gpu.module @test_module {
     func.return %shfl, %shfli : f32, f32
   }
 }
+
+// -----
+
+// CHECK-LABEL: @test_custom_data_layout
+// CHECK-SAME: llvm.data_layout = "e"
+gpu.module @test_custom_data_layout attributes {llvm.data_layout = "e"} {
+
+}