[Mlir-commits] [mlir] [MLIR][AMDGPU] Add amdgpu.global_transpose_load op for RDNA4 global memory transpose loads (PR #195287)
Nirvedh Meshram
llvmlistbot at llvm.org
Fri May 1 11:50:51 PDT 2026
https://github.com/nirvedhmeshram updated https://github.com/llvm/llvm-project/pull/195287
>From 25240c4708f95a5473189529ce12f0f07edaefe0 Mon Sep 17 00:00:00 2001
From: Nirvedh Meshram <nirvedh at gmail.com>
Date: Fri, 1 May 2026 11:56:49 -0400
Subject: [PATCH 1/6] [MLIR][AMDGPU] Add amdgpu.global_transpose_load op for
RDNA4 global memory transpose loads
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Adds a new `amdgpu.global_transpose_load` op to the AMDGPU dialect that
wraps the `global_load_tr` family of instructions introduced in RDNA4
(gfx1250+). Each thread reads a column of a matrix from global memory
and receives the corresponding transposed row in its result register.
The op is kept separate from the existing `amdgpu.transpose_load` (which
targets LDS via `ds_read_tr` on gfx950+) because the two variants target
different GPU architecture families, have different chipset requirements,
and differ in their valid (element size, num elements) combinations — in
particular the 16-bit case produces a 128-bit (8-element) result via
`global_load_tr.b128` rather than the 64-bit (4-element) result from
`ds_read_tr16.b64`.
The op lowers to the existing ROCDL `global.load.tr{4,6,.}.b{64,96,128}`
intrinsics; the lowering is supported on gfx1250+.
Co-authored-by: Claude Sonnet 4 (1M context) <noreply at anthropic.com>
Signed-off-by: Nirvedh Meshram <nirvedh at gmail.com>
---
.../mlir/Dialect/AMDGPU/IR/AMDGPUOps.td | 41 ++++++++++
.../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 75 +++++++++++++++++++
mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp | 37 +++++++++
.../AMDGPUToROCDL/global_transpose_load.mlir | 54 +++++++++++++
4 files changed, 207 insertions(+)
create mode 100644 mlir/test/Conversion/AMDGPUToROCDL/global_transpose_load.mlir
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
index 4112ea281bb96..53bf8a31c5415 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
@@ -1449,6 +1449,47 @@ def AMDGPU_TransposeLoadOp :
let hasVerifier = 1;
}
+def AMDGPU_GlobalTransposeLoadOp :
+ AMDGPU_Op<"global_transpose_load", [SameVariadicOperandSize]>,
+ Arguments<(ins Arg<AnyMemRef, "buffer to transpose load from", [MemRead]>:$src,
+ Variadic<Index>:$srcIndices)>,
+ Results<(outs AnyTypeOf<[AnyVectorOfNonZeroRank]>:$result)> {
+ let summary = "MLIR wrapper for RDNA4 global memory transpose load instructions";
+ let description = [{
+ The `amdgpu.global_transpose_load` op is a wrapper around the
+ `global_load_tr` family of instructions introduced in RDNA4 (gfx1250+).
+
+ Each thread reads a column of a matrix stored in global memory and receives
+ the corresponding row of the transposed matrix in its result register.
+ The subgroup collectively performs a transpose of the tile.
+
+ This op is a direct wrapper around the ROCDL `global.load.tr` family
+ intrinsics. Refer to the RDNA4 ISA documentation for exact semantics.
+
+ Format example:
+ ```
+ %0 = amdgpu.global_transpose_load %src[%i, %j]
+ : memref<128x256xf16, #gpu.address_space<global>> -> vector<8xf16>
+ ```
+ Operands:
+ * `$src`: Global address space memref to read from.
+ * `$srcIndices`: indices into `$src` for this thread.
+ * `$result`: register this transpose load instruction writes to.
+
+ Valid (element bits, num elements) pairs:
+ * (4, 16) -> global_load_tr4_b64
+ * (6, 16) -> global_load_tr6_b96
+ * (8, 8) -> global_load_tr8_b64
+ * (16, 8) -> global_load_tr.b128
+
+ Note: Lowering is only supported on gfx1250 and up.
+ }];
+ let assemblyFormat = [{
+ $src `[` $srcIndices `]` attr-dict `:` type($src) `->` type($result)
+ }];
+ let hasVerifier = 1;
+}
+
def AMDGPU_ScaledMFMAOp :
AMDGPU_Op<"scaled_mfma", [AllTypesMatch<["destC", "destD"]>,
Pure]>,
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 8464d1e29f0aa..044d79193f674 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -2226,6 +2226,80 @@ struct TransposeLoadOpLowering
}
};
+struct GlobalTransposeLoadOpLowering
+ : public ConvertOpToLLVMPattern<GlobalTransposeLoadOp> {
+ GlobalTransposeLoadOpLowering(const LLVMTypeConverter &converter,
+ Chipset chipset)
+ : ConvertOpToLLVMPattern<GlobalTransposeLoadOp>(converter),
+ chipset(chipset) {}
+
+ Chipset chipset;
+
+ LogicalResult
+ matchAndRewrite(GlobalTransposeLoadOp op,
+ GlobalTransposeLoadOpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ if (chipset < kGfx1250)
+ return op.emitOpError(
+ "global_transpose_load is only supported on gfx1250+");
+
+ Location loc = op.getLoc();
+ auto srcMemRefType = cast<MemRefType>(op.getSrc().getType());
+ auto resultType = cast<VectorType>(op.getResult().getType());
+
+ Value srcPtr =
+ getStridedElementPtr(rewriter, loc, srcMemRefType, adaptor.getSrc(),
+ adaptor.getSrcIndices());
+
+ size_t numElements = resultType.getNumElements();
+ size_t elementTypeSize =
+ resultType.getElementType().getIntOrFloatBitWidth();
+
+ // ROCDL global transpose load intrinsics return vectors of i32 for
+ // sub-16-bit elements, matching the LDS lowering convention.
+ Type rocdlResultType =
+ elementTypeSize < 16
+ ? VectorType::get((numElements * elementTypeSize) / 32,
+ rewriter.getIntegerType(32))
+ : typeConverter->convertType(resultType);
+ Type llvmResultType = typeConverter->convertType(resultType);
+
+ switch (elementTypeSize) {
+ case 4: {
+ assert(numElements == 16);
+ auto rocdlOp = ROCDL::GlobalLoadTr4_B64::create(rewriter, loc,
+ rocdlResultType, srcPtr);
+ rewriter.replaceOpWithNewOp<LLVM::BitcastOp>(op, llvmResultType, rocdlOp);
+ break;
+ }
+ case 6: {
+ assert(numElements == 16);
+ auto rocdlOp = ROCDL::GlobalLoadTr6_B96::create(rewriter, loc,
+ rocdlResultType, srcPtr);
+ rewriter.replaceOpWithNewOp<LLVM::BitcastOp>(op, llvmResultType, rocdlOp);
+ break;
+ }
+ case 8: {
+ assert(numElements == 8);
+ auto rocdlOp = ROCDL::GlobalLoadTr8_B64::create(rewriter, loc,
+ rocdlResultType, srcPtr);
+ rewriter.replaceOpWithNewOp<LLVM::BitcastOp>(op, llvmResultType, rocdlOp);
+ break;
+ }
+ case 16: {
+ assert(numElements == 8);
+ rewriter.replaceOpWithNewOp<ROCDL::GlobalLoadTr8_B128>(op, llvmResultType,
+ srcPtr);
+ break;
+ }
+ default:
+ return op.emitOpError(
+ "unsupported element size for global transpose load");
+ }
+ return success();
+ }
+};
+
struct GatherToLDSOpLowering : public ConvertOpToLLVMPattern<GatherToLDSOp> {
GatherToLDSOpLowering(const LLVMTypeConverter &converter, Chipset chipset)
: ConvertOpToLLVMPattern<GatherToLDSOp>(converter), chipset(chipset) {}
@@ -4408,6 +4482,7 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
PackedScaledTruncOpLowering, PackedTrunc2xFp8OpLowering,
PackedStochRoundFp8OpLowering, GatherToLDSOpLowering,
GlobalLoadAsyncToLDSOpLowering, TransposeLoadOpLowering,
+ GlobalTransposeLoadOpLowering,
AMDGPUPermlaneLowering, AMDGPUMakeDmaBaseLowering<MakeDmaBaseOp>,
AMDGPUMakeDmaBaseLowering<MakeGatherDmaBaseOp>,
AMDGPULowerDescriptor<MakeDmaDescriptorOp>,
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
index 2f6f59194fba3..bf0d5a47c6417 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
@@ -1079,6 +1079,43 @@ LogicalResult TransposeLoadOp::verify() {
return success();
}
+//===----------------------------------------------------------------------===//
+// GlobalTransposeLoadOp
+//===----------------------------------------------------------------------===//
+
+LogicalResult GlobalTransposeLoadOp::verify() {
+ MemRefType srcType = cast<MemRefType>(getSrc().getType());
+
+ if (!hasGlobalMemorySpace(srcType.getMemorySpace()))
+ return emitOpError("source memory address space must be Global");
+
+ auto resultType = cast<VectorType>(getType());
+ size_t numElements = resultType.getNumElements();
+ size_t elementTypeSize =
+ resultType.getElementType().getIntOrFloatBitWidth();
+
+ // ElementSize -> NumElements (matches global_load_tr* ISA variants)
+ const llvm::SmallDenseMap<size_t, size_t> kValidLoadSizeMap = {
+ {4, 16}, // global_load_tr4_b64
+ {6, 16}, // global_load_tr6_b96
+ {8, 8}, // global_load_tr8_b64
+ {16, 8}, // global_load_tr.b128
+ };
+
+ auto validNumElems = kValidLoadSizeMap.find(elementTypeSize);
+ if (validNumElems == kValidLoadSizeMap.end())
+ return emitOpError(
+ "unsupported element type size for global transpose load: ")
+ << elementTypeSize << " bits";
+
+ if (numElements != validNumElems->second)
+ return emitOpError(
+ "transferring type size mismatch: expected num of elements: ")
+ << validNumElems->second;
+
+ return success();
+}
+
//===----------------------------------------------------------------------===//
// MakeDmaBaseOp
//===----------------------------------------------------------------------===//
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/global_transpose_load.mlir b/mlir/test/Conversion/AMDGPUToROCDL/global_transpose_load.mlir
new file mode 100644
index 0000000000000..8ac65363ed8c1
--- /dev/null
+++ b/mlir/test/Conversion/AMDGPUToROCDL/global_transpose_load.mlir
@@ -0,0 +1,54 @@
+// RUN: mlir-opt %s --split-input-file -convert-amdgpu-to-rocdl=chipset=gfx1250 | FileCheck %s
+// RUN: not mlir-opt %s --split-input-file -convert-amdgpu-to-rocdl=chipset=gfx942 2>&1 | FileCheck %s --check-prefix=CHECK-OLD
+
+// CHECK-LABEL: func @global_transpose_load_8xf16
+func.func @global_transpose_load_8xf16(%i : index, %j : index,
+ %src : memref<128x256xf16, #gpu.address_space<global>>) -> vector<8xf16> {
+ // CHECK: rocdl.global.load.tr.b128
+ // CHECK-OLD: error: 'amdgpu.global_transpose_load' op global_transpose_load is only supported on gfx1250+
+ %0 = amdgpu.global_transpose_load %src[%i, %j]
+ : memref<128x256xf16, #gpu.address_space<global>> -> vector<8xf16>
+ return %0 : vector<8xf16>
+}
+
+// -----
+
+// CHECK-LABEL: func @global_transpose_load_8xi8
+func.func @global_transpose_load_8xi8(%i : index, %j : index,
+ %src : memref<128x256xi8, #gpu.address_space<global>>) -> vector<8xi8> {
+ // CHECK: %[[RES:.*]] = rocdl.global.load.tr.b64
+ // CHECK-SAME: -> vector<2xi32>
+ // CHECK-NEXT: llvm.bitcast %[[RES]] : vector<2xi32> to vector<8xi8>
+ // CHECK-OLD: error: 'amdgpu.global_transpose_load' op global_transpose_load is only supported on gfx1250+
+ %0 = amdgpu.global_transpose_load %src[%i, %j]
+ : memref<128x256xi8, #gpu.address_space<global>> -> vector<8xi8>
+ return %0 : vector<8xi8>
+}
+
+// -----
+
+// CHECK-LABEL: func @global_transpose_load_16xi4
+func.func @global_transpose_load_16xi4(%i : index, %j : index,
+ %src : memref<128x32xi8, #gpu.address_space<global>>) -> vector<16xi4> {
+ // CHECK: %[[RES:.*]] = rocdl.global.load.tr4.b64
+ // CHECK-SAME: -> vector<2xi32>
+ // CHECK-NEXT: llvm.bitcast %[[RES]] : vector<2xi32> to vector<16xi4>
+ // CHECK-OLD: error: 'amdgpu.global_transpose_load' op global_transpose_load is only supported on gfx1250+
+ %0 = amdgpu.global_transpose_load %src[%i, %j]
+ : memref<128x32xi8, #gpu.address_space<global>> -> vector<16xi4>
+ return %0 : vector<16xi4>
+}
+
+// -----
+
+// CHECK-LABEL: func @global_transpose_load_16xi6
+func.func @global_transpose_load_16xi6(%i : index, %j : index,
+ %src : memref<128x32xi8, #gpu.address_space<global>>) -> vector<16xi6> {
+ // CHECK: %[[RES:.*]] = rocdl.global.load.tr6.b96
+ // CHECK-SAME: -> vector<3xi32>
+ // CHECK-NEXT: llvm.bitcast %[[RES]] : vector<3xi32> to vector<16xi6>
+ // CHECK-OLD: error: 'amdgpu.global_transpose_load' op global_transpose_load is only supported on gfx1250+
+ %0 = amdgpu.global_transpose_load %src[%i, %j]
+ : memref<128x32xi8, #gpu.address_space<global>> -> vector<16xi6>
+ return %0 : vector<16xi6>
+}
>From 209f0f145d3b55abe911a7921310ae4b5340ed36 Mon Sep 17 00:00:00 2001
From: Nirvedh Meshram <nirvedh at gmail.com>
Date: Fri, 1 May 2026 12:28:38 -0400
Subject: [PATCH 2/6] remove bitwidth 4 and 6 support as ISA doesn't mention it
Signed-off-by: Nirvedh Meshram <nirvedh at gmail.com>
---
.../mlir/Dialect/AMDGPU/IR/AMDGPUOps.td | 6 +--
.../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 14 -------
mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp | 8 ++--
.../AMDGPUToROCDL/global_transpose_load.mlir | 40 ++++++++++---------
4 files changed, 26 insertions(+), 42 deletions(-)
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
index 53bf8a31c5415..edc68d8c0e590 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
@@ -1477,10 +1477,8 @@ def AMDGPU_GlobalTransposeLoadOp :
* `$result`: register this transpose load instruction writes to.
Valid (element bits, num elements) pairs:
- * (4, 16) -> global_load_tr4_b64
- * (6, 16) -> global_load_tr6_b96
- * (8, 8) -> global_load_tr8_b64
- * (16, 8) -> global_load_tr.b128
+ * (8, 8) -> global_load_tr_b64
+ * (16, 8) -> global_load_tr_b128
Note: Lowering is only supported on gfx1250 and up.
}];
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 044d79193f674..5844a845bd9e6 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -2265,20 +2265,6 @@ struct GlobalTransposeLoadOpLowering
Type llvmResultType = typeConverter->convertType(resultType);
switch (elementTypeSize) {
- case 4: {
- assert(numElements == 16);
- auto rocdlOp = ROCDL::GlobalLoadTr4_B64::create(rewriter, loc,
- rocdlResultType, srcPtr);
- rewriter.replaceOpWithNewOp<LLVM::BitcastOp>(op, llvmResultType, rocdlOp);
- break;
- }
- case 6: {
- assert(numElements == 16);
- auto rocdlOp = ROCDL::GlobalLoadTr6_B96::create(rewriter, loc,
- rocdlResultType, srcPtr);
- rewriter.replaceOpWithNewOp<LLVM::BitcastOp>(op, llvmResultType, rocdlOp);
- break;
- }
case 8: {
assert(numElements == 8);
auto rocdlOp = ROCDL::GlobalLoadTr8_B64::create(rewriter, loc,
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
index bf0d5a47c6417..7d9bccd899a69 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
@@ -1094,12 +1094,10 @@ LogicalResult GlobalTransposeLoadOp::verify() {
size_t elementTypeSize =
resultType.getElementType().getIntOrFloatBitWidth();
- // ElementSize -> NumElements (matches global_load_tr* ISA variants)
+ // ElementSize -> NumElements (matches ISA-documented global_load_tr variants)
const llvm::SmallDenseMap<size_t, size_t> kValidLoadSizeMap = {
- {4, 16}, // global_load_tr4_b64
- {6, 16}, // global_load_tr6_b96
- {8, 8}, // global_load_tr8_b64
- {16, 8}, // global_load_tr.b128
+ {8, 8}, // global_load_tr_b64
+ {16, 8}, // global_load_tr_b128
};
auto validNumElems = kValidLoadSizeMap.find(elementTypeSize);
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/global_transpose_load.mlir b/mlir/test/Conversion/AMDGPUToROCDL/global_transpose_load.mlir
index 8ac65363ed8c1..159706ccb53fd 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/global_transpose_load.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/global_transpose_load.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s --split-input-file -convert-amdgpu-to-rocdl=chipset=gfx1250 | FileCheck %s
+// RUN: mlir-opt %s --split-input-file --verify-diagnostics -convert-amdgpu-to-rocdl=chipset=gfx1250 | FileCheck %s
// RUN: not mlir-opt %s --split-input-file -convert-amdgpu-to-rocdl=chipset=gfx942 2>&1 | FileCheck %s --check-prefix=CHECK-OLD
// CHECK-LABEL: func @global_transpose_load_8xf16
@@ -27,28 +27,30 @@ func.func @global_transpose_load_8xi8(%i : index, %j : index,
// -----
-// CHECK-LABEL: func @global_transpose_load_16xi4
-func.func @global_transpose_load_16xi4(%i : index, %j : index,
- %src : memref<128x32xi8, #gpu.address_space<global>>) -> vector<16xi4> {
- // CHECK: %[[RES:.*]] = rocdl.global.load.tr4.b64
- // CHECK-SAME: -> vector<2xi32>
- // CHECK-NEXT: llvm.bitcast %[[RES]] : vector<2xi32> to vector<16xi4>
- // CHECK-OLD: error: 'amdgpu.global_transpose_load' op global_transpose_load is only supported on gfx1250+
+func.func @global_transpose_load_wrong_addrspace(%i : index, %j : index,
+ %src : memref<128x256xf16, 3>) -> vector<8xf16> {
+ // expected-error at +1 {{'amdgpu.global_transpose_load' op source memory address space must be Global}}
%0 = amdgpu.global_transpose_load %src[%i, %j]
- : memref<128x32xi8, #gpu.address_space<global>> -> vector<16xi4>
- return %0 : vector<16xi4>
+ : memref<128x256xf16, 3> -> vector<8xf16>
+ return %0 : vector<8xf16>
}
// -----
-// CHECK-LABEL: func @global_transpose_load_16xi6
-func.func @global_transpose_load_16xi6(%i : index, %j : index,
- %src : memref<128x32xi8, #gpu.address_space<global>>) -> vector<16xi6> {
- // CHECK: %[[RES:.*]] = rocdl.global.load.tr6.b96
- // CHECK-SAME: -> vector<3xi32>
- // CHECK-NEXT: llvm.bitcast %[[RES]] : vector<3xi32> to vector<16xi6>
- // CHECK-OLD: error: 'amdgpu.global_transpose_load' op global_transpose_load is only supported on gfx1250+
+func.func @global_transpose_load_unsupported_f32(%i : index, %j : index,
+ %src : memref<128x256xf32, #gpu.address_space<global>>) -> vector<8xf32> {
+ // expected-error at +1 {{'amdgpu.global_transpose_load' op unsupported element type size for global transpose load: 32 bits}}
+ %0 = amdgpu.global_transpose_load %src[%i, %j]
+ : memref<128x256xf32, #gpu.address_space<global>> -> vector<8xf32>
+ return %0 : vector<8xf32>
+}
+
+// -----
+
+func.func @global_transpose_load_wrong_num_elements(%i : index, %j : index,
+ %src : memref<128x256xf16, #gpu.address_space<global>>) -> vector<4xf16> {
+ // expected-error at +1 {{'amdgpu.global_transpose_load' op transferring type size mismatch: expected num of elements: 8}}
%0 = amdgpu.global_transpose_load %src[%i, %j]
- : memref<128x32xi8, #gpu.address_space<global>> -> vector<16xi6>
- return %0 : vector<16xi6>
+ : memref<128x256xf16, #gpu.address_space<global>> -> vector<4xf16>
+ return %0 : vector<4xf16>
}
>From 116b9ccd6a568c2c74f8988e5f48c5f6bd98a5b0 Mon Sep 17 00:00:00 2001
From: Nirvedh Meshram <nirvedh at gmail.com>
Date: Fri, 1 May 2026 12:36:52 -0400
Subject: [PATCH 3/6] format
Signed-off-by: Nirvedh Meshram <nirvedh at gmail.com>
---
mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
index 7d9bccd899a69..e426f7c04be37 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
@@ -1096,8 +1096,8 @@ LogicalResult GlobalTransposeLoadOp::verify() {
// ElementSize -> NumElements (matches ISA-documented global_load_tr variants)
const llvm::SmallDenseMap<size_t, size_t> kValidLoadSizeMap = {
- {8, 8}, // global_load_tr_b64
- {16, 8}, // global_load_tr_b128
+ {8, 8}, // global_load_tr_b64
+ {16, 8}, // global_load_tr_b128
};
auto validNumElems = kValidLoadSizeMap.find(elementTypeSize);
>From b3831198d5fede3f8da3b23e704e7f532740affc Mon Sep 17 00:00:00 2001
From: Nirvedh Meshram <nirvedh at gmail.com>
Date: Fri, 1 May 2026 13:10:28 -0400
Subject: [PATCH 4/6] fix chip to be gfx1200+
---
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td | 4 ++--
mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp | 5 +++--
.../Conversion/AMDGPUToROCDL/global_transpose_load.mlir | 6 +++---
3 files changed, 8 insertions(+), 7 deletions(-)
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
index edc68d8c0e590..6a9c51026baa8 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
@@ -1457,7 +1457,7 @@ def AMDGPU_GlobalTransposeLoadOp :
let summary = "MLIR wrapper for RDNA4 global memory transpose load instructions";
let description = [{
The `amdgpu.global_transpose_load` op is a wrapper around the
- `global_load_tr` family of instructions introduced in RDNA4 (gfx1250+).
+ `global_load_tr` family of instructions introduced in RDNA4 (gfx1200+).
Each thread reads a column of a matrix stored in global memory and receives
the corresponding row of the transposed matrix in its result register.
@@ -1480,7 +1480,7 @@ def AMDGPU_GlobalTransposeLoadOp :
* (8, 8) -> global_load_tr_b64
* (16, 8) -> global_load_tr_b128
- Note: Lowering is only supported on gfx1250 and up.
+ Note: Lowering is only supported on gfx1200 and up.
}];
let assemblyFormat = [{
$src `[` $srcIndices `]` attr-dict `:` type($src) `->` type($result)
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 5844a845bd9e6..d1552d0b16ed9 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -48,6 +48,7 @@ constexpr Chipset kGfx908 = Chipset(9, 0, 8);
constexpr Chipset kGfx90a = Chipset(9, 0, 0xa);
constexpr Chipset kGfx942 = Chipset(9, 4, 2);
constexpr Chipset kGfx950 = Chipset(9, 5, 0);
+constexpr Chipset kGfx1200 = Chipset(12, 0, 0);
constexpr Chipset kGfx1250 = Chipset(12, 5, 0);
// Predicates mirroring the LLVM AMDGPU `HasDot{N}Insts` features that gate
@@ -2239,9 +2240,9 @@ struct GlobalTransposeLoadOpLowering
matchAndRewrite(GlobalTransposeLoadOp op,
GlobalTransposeLoadOpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
- if (chipset < kGfx1250)
+ if (chipset < kGfx1200)
return op.emitOpError(
- "global_transpose_load is only supported on gfx1250+");
+ "global_transpose_load is only supported on gfx1200+");
Location loc = op.getLoc();
auto srcMemRefType = cast<MemRefType>(op.getSrc().getType());
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/global_transpose_load.mlir b/mlir/test/Conversion/AMDGPUToROCDL/global_transpose_load.mlir
index 159706ccb53fd..80eab3ac8c971 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/global_transpose_load.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/global_transpose_load.mlir
@@ -1,11 +1,11 @@
-// RUN: mlir-opt %s --split-input-file --verify-diagnostics -convert-amdgpu-to-rocdl=chipset=gfx1250 | FileCheck %s
+// RUN: mlir-opt %s --split-input-file --verify-diagnostics -convert-amdgpu-to-rocdl=chipset=gfx1201 | FileCheck %s
// RUN: not mlir-opt %s --split-input-file -convert-amdgpu-to-rocdl=chipset=gfx942 2>&1 | FileCheck %s --check-prefix=CHECK-OLD
// CHECK-LABEL: func @global_transpose_load_8xf16
func.func @global_transpose_load_8xf16(%i : index, %j : index,
%src : memref<128x256xf16, #gpu.address_space<global>>) -> vector<8xf16> {
// CHECK: rocdl.global.load.tr.b128
- // CHECK-OLD: error: 'amdgpu.global_transpose_load' op global_transpose_load is only supported on gfx1250+
+ // CHECK-OLD: error: 'amdgpu.global_transpose_load' op global_transpose_load is only supported on gfx1200+
%0 = amdgpu.global_transpose_load %src[%i, %j]
: memref<128x256xf16, #gpu.address_space<global>> -> vector<8xf16>
return %0 : vector<8xf16>
@@ -19,7 +19,7 @@ func.func @global_transpose_load_8xi8(%i : index, %j : index,
// CHECK: %[[RES:.*]] = rocdl.global.load.tr.b64
// CHECK-SAME: -> vector<2xi32>
// CHECK-NEXT: llvm.bitcast %[[RES]] : vector<2xi32> to vector<8xi8>
- // CHECK-OLD: error: 'amdgpu.global_transpose_load' op global_transpose_load is only supported on gfx1250+
+ // CHECK-OLD: error: 'amdgpu.global_transpose_load' op global_transpose_load is only supported on gfx1200+
%0 = amdgpu.global_transpose_load %src[%i, %j]
: memref<128x256xi8, #gpu.address_space<global>> -> vector<8xi8>
return %0 : vector<8xi8>
>From 41dfe602b505a818c0313907700d38c23c6f4b8b Mon Sep 17 00:00:00 2001
From: Nirvedh Meshram <nirvedh at gmail.com>
Date: Fri, 1 May 2026 13:46:15 -0400
Subject: [PATCH 5/6] format
Signed-off-by: Nirvedh Meshram <nirvedh at gmail.com>
---
mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp | 4 ++--
mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp | 3 +--
2 files changed, 3 insertions(+), 4 deletions(-)
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index d1552d0b16ed9..a86102dceb870 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -4469,8 +4469,8 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
PackedScaledTruncOpLowering, PackedTrunc2xFp8OpLowering,
PackedStochRoundFp8OpLowering, GatherToLDSOpLowering,
GlobalLoadAsyncToLDSOpLowering, TransposeLoadOpLowering,
- GlobalTransposeLoadOpLowering,
- AMDGPUPermlaneLowering, AMDGPUMakeDmaBaseLowering<MakeDmaBaseOp>,
+ GlobalTransposeLoadOpLowering, AMDGPUPermlaneLowering,
+ AMDGPUMakeDmaBaseLowering<MakeDmaBaseOp>,
AMDGPUMakeDmaBaseLowering<MakeGatherDmaBaseOp>,
AMDGPULowerDescriptor<MakeDmaDescriptorOp>,
AMDGPULowerDescriptor<MakeGatherDmaDescriptorOp>,
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
index e426f7c04be37..d50448a6d72b3 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
@@ -1091,8 +1091,7 @@ LogicalResult GlobalTransposeLoadOp::verify() {
auto resultType = cast<VectorType>(getType());
size_t numElements = resultType.getNumElements();
- size_t elementTypeSize =
- resultType.getElementType().getIntOrFloatBitWidth();
+ size_t elementTypeSize = resultType.getElementType().getIntOrFloatBitWidth();
// ElementSize -> NumElements (matches ISA-documented global_load_tr variants)
const llvm::SmallDenseMap<size_t, size_t> kValidLoadSizeMap = {
>From 5355ebf561951f6ce7e60d69324d16645502a308 Mon Sep 17 00:00:00 2001
From: Nirvedh Meshram <nirvedh at gmail.com>
Date: Fri, 1 May 2026 14:49:55 -0400
Subject: [PATCH 6/6] address reviewer comments
Signed-off-by: Nirvedh Meshram <nirvedh at gmail.com>
---
.../mlir/Dialect/AMDGPU/IR/AMDGPUOps.td | 18 +++++---
.../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 24 +++++++++--
mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp | 9 ++--
.../AMDGPUToROCDL/global_transpose_load.mlir | 43 ++++++++++---------
mlir/test/Dialect/AMDGPU/invalid.mlir | 10 +++++
mlir/test/Dialect/AMDGPU/ops.mlir | 9 ++++
6 files changed, 82 insertions(+), 31 deletions(-)
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
index 6a9c51026baa8..e59ba1331958c 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
@@ -1453,7 +1453,10 @@ def AMDGPU_GlobalTransposeLoadOp :
AMDGPU_Op<"global_transpose_load", [SameVariadicOperandSize]>,
Arguments<(ins Arg<AnyMemRef, "buffer to transpose load from", [MemRead]>:$src,
Variadic<Index>:$srcIndices)>,
- Results<(outs AnyTypeOf<[AnyVectorOfNonZeroRank]>:$result)> {
+ Results<(outs AnyTypeOf<[
+ FixedVectorOfLengthAndType<[8], [I8, F16, BF16, I16]>,
+ FixedVectorOfLengthAndType<[16], [I<4>, I<6>]>
+ ]>:$result)> {
let summary = "MLIR wrapper for RDNA4 global memory transpose load instructions";
let description = [{
The `amdgpu.global_transpose_load` op is a wrapper around the
@@ -1473,14 +1476,19 @@ def AMDGPU_GlobalTransposeLoadOp :
```
Operands:
* `$src`: Global address space memref to read from.
- * `$srcIndices`: indices into `$src` for this thread.
+ * `$srcIndices`: indices into `$src` for this thread. Indices must be
+ non-negative and in-bounds for the corresponding dimension of `$src`,
+ matching the constraints of `memref.load`.
* `$result`: register this transpose load instruction writes to.
Valid (element bits, num elements) pairs:
- * (8, 8) -> global_load_tr_b64
- * (16, 8) -> global_load_tr_b128
+ * (4, 16) -> global_load_tr4_b64 (gfx1250+)
+ * (6, 16) -> global_load_tr6_b96 (gfx1250+)
+ * (8, 8) -> global_load_tr_b64 (gfx1200+)
+ * (16, 8) -> global_load_tr_b128 (gfx1200+)
- Note: Lowering is only supported on gfx1200 and up.
+ Note: 8-bit and 16-bit element lowering requires gfx1200+.
+ 4-bit and 6-bit element lowering requires gfx1250+.
}];
let assemblyFormat = [{
$src `[` $srcIndices `]` attr-dict `:` type($src) `->` type($result)
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index a86102dceb870..78a87e3972d92 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -2248,9 +2248,9 @@ struct GlobalTransposeLoadOpLowering
auto srcMemRefType = cast<MemRefType>(op.getSrc().getType());
auto resultType = cast<VectorType>(op.getResult().getType());
- Value srcPtr =
- getStridedElementPtr(rewriter, loc, srcMemRefType, adaptor.getSrc(),
- adaptor.getSrcIndices());
+ Value srcPtr = getStridedElementPtr(
+ rewriter, loc, srcMemRefType, adaptor.getSrc(), adaptor.getSrcIndices(),
+ LLVM::GEPNoWrapFlags::inbounds | LLVM::GEPNoWrapFlags::nuw);
size_t numElements = resultType.getNumElements();
size_t elementTypeSize =
@@ -2266,6 +2266,24 @@ struct GlobalTransposeLoadOpLowering
Type llvmResultType = typeConverter->convertType(resultType);
switch (elementTypeSize) {
+ case 4: {
+ assert(numElements == 16);
+ if (chipset < kGfx1250)
+ return op.emitOpError("4-bit global_transpose_load requires gfx1250+");
+ auto rocdlOp = ROCDL::GlobalLoadTr4_B64::create(rewriter, loc,
+ rocdlResultType, srcPtr);
+ rewriter.replaceOpWithNewOp<LLVM::BitcastOp>(op, llvmResultType, rocdlOp);
+ break;
+ }
+ case 6: {
+ assert(numElements == 16);
+ if (chipset < kGfx1250)
+ return op.emitOpError("6-bit global_transpose_load requires gfx1250+");
+ auto rocdlOp = ROCDL::GlobalLoadTr6_B96::create(rewriter, loc,
+ rocdlResultType, srcPtr);
+ rewriter.replaceOpWithNewOp<LLVM::BitcastOp>(op, llvmResultType, rocdlOp);
+ break;
+ }
case 8: {
assert(numElements == 8);
auto rocdlOp = ROCDL::GlobalLoadTr8_B64::create(rewriter, loc,
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
index d50448a6d72b3..2674aeec3c14a 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
@@ -1093,10 +1093,13 @@ LogicalResult GlobalTransposeLoadOp::verify() {
size_t numElements = resultType.getNumElements();
size_t elementTypeSize = resultType.getElementType().getIntOrFloatBitWidth();
- // ElementSize -> NumElements (matches ISA-documented global_load_tr variants)
+ // ElementSize -> NumElements. Chipset gating (gfx1200 vs gfx1250) is
+ // enforced in the lowering.
const llvm::SmallDenseMap<size_t, size_t> kValidLoadSizeMap = {
- {8, 8}, // global_load_tr_b64
- {16, 8}, // global_load_tr_b128
+ {4, 16}, // global_load_tr4_b64 (gfx1250+)
+ {6, 16}, // global_load_tr6_b96 (gfx1250+)
+ {8, 8}, // global_load_tr_b64 (gfx1200+)
+ {16, 8}, // global_load_tr_b128 (gfx1200+)
};
auto validNumElems = kValidLoadSizeMap.find(elementTypeSize);
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/global_transpose_load.mlir b/mlir/test/Conversion/AMDGPUToROCDL/global_transpose_load.mlir
index 80eab3ac8c971..bf81180d35612 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/global_transpose_load.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/global_transpose_load.mlir
@@ -1,4 +1,5 @@
// RUN: mlir-opt %s --split-input-file --verify-diagnostics -convert-amdgpu-to-rocdl=chipset=gfx1201 | FileCheck %s
+// RUN: mlir-opt %s --split-input-file -convert-amdgpu-to-rocdl=chipset=gfx1250 | FileCheck %s --check-prefixes=CHECK,CHECK-GFX1250
// RUN: not mlir-opt %s --split-input-file -convert-amdgpu-to-rocdl=chipset=gfx942 2>&1 | FileCheck %s --check-prefix=CHECK-OLD
// CHECK-LABEL: func @global_transpose_load_8xf16
@@ -27,30 +28,32 @@ func.func @global_transpose_load_8xi8(%i : index, %j : index,
// -----
-func.func @global_transpose_load_wrong_addrspace(%i : index, %j : index,
- %src : memref<128x256xf16, 3>) -> vector<8xf16> {
- // expected-error @+1 {{'amdgpu.global_transpose_load' op source memory address space must be Global}}
- %0 = amdgpu.global_transpose_load %src[%i, %j]
- : memref<128x256xf16, 3> -> vector<8xf16>
- return %0 : vector<8xf16>
-}
-
-// -----
-
-func.func @global_transpose_load_unsupported_f32(%i : index, %j : index,
- %src : memref<128x256xf32, #gpu.address_space<global>>) -> vector<8xf32> {
- // expected-error @+1 {{'amdgpu.global_transpose_load' op unsupported element type size for global transpose load: 32 bits}}
+// CHECK-GFX1250-LABEL: func @global_transpose_load_16xi4
+func.func @global_transpose_load_16xi4(%i : index, %j : index,
+ %src : memref<128x32xi8, #gpu.address_space<global>>) -> vector<16xi4> {
+ // CHECK-GFX1250: %[[RES:.*]] = rocdl.global.load.tr4.b64
+ // CHECK-GFX1250-SAME: -> vector<2xi32>
+ // CHECK-GFX1250-NEXT: llvm.bitcast %[[RES]] : vector<2xi32> to vector<16xi4>
+ // CHECK-OLD: error: 'amdgpu.global_transpose_load' op global_transpose_load is only supported on gfx1200+
+ // expected-error @+2 {{'amdgpu.global_transpose_load' op 4-bit global_transpose_load requires gfx1250+}}
+ // expected-error @+1 {{failed to legalize operation 'amdgpu.global_transpose_load'}}
%0 = amdgpu.global_transpose_load %src[%i, %j]
- : memref<128x256xf32, #gpu.address_space<global>> -> vector<8xf32>
- return %0 : vector<8xf32>
+ : memref<128x32xi8, #gpu.address_space<global>> -> vector<16xi4>
+ return %0 : vector<16xi4>
}
// -----
-func.func @global_transpose_load_wrong_num_elements(%i : index, %j : index,
- %src : memref<128x256xf16, #gpu.address_space<global>>) -> vector<4xf16> {
- // expected-error @+1 {{'amdgpu.global_transpose_load' op transferring type size mismatch: expected num of elements: 8}}
+// CHECK-GFX1250-LABEL: func @global_transpose_load_16xi6
+func.func @global_transpose_load_16xi6(%i : index, %j : index,
+ %src : memref<128x32xi8, #gpu.address_space<global>>) -> vector<16xi6> {
+ // CHECK-GFX1250: %[[RES:.*]] = rocdl.global.load.tr6.b96
+ // CHECK-GFX1250-SAME: -> vector<3xi32>
+ // CHECK-GFX1250-NEXT: llvm.bitcast %[[RES]] : vector<3xi32> to vector<16xi6>
+ // CHECK-OLD: error: 'amdgpu.global_transpose_load' op global_transpose_load is only supported on gfx1200+
+ // expected-error @+2 {{'amdgpu.global_transpose_load' op 6-bit global_transpose_load requires gfx1250+}}
+ // expected-error @+1 {{failed to legalize operation 'amdgpu.global_transpose_load'}}
%0 = amdgpu.global_transpose_load %src[%i, %j]
- : memref<128x256xf16, #gpu.address_space<global>> -> vector<4xf16>
- return %0 : vector<4xf16>
+ : memref<128x32xi8, #gpu.address_space<global>> -> vector<16xi6>
+ return %0 : vector<16xi6>
}
diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir
index 2958b0fe2bc51..4fe3185a27bd1 100644
--- a/mlir/test/Dialect/AMDGPU/invalid.mlir
+++ b/mlir/test/Dialect/AMDGPU/invalid.mlir
@@ -317,6 +317,16 @@ func.func @transpose_load_vector_size_i8(%idx1 : index, %idx2 : index, %mem : me
// -----
+func.func @global_transpose_load_wrong_addrspace(%i : index, %j : index,
+ %src : memref<128x256xf16, 3>) -> vector<8xf16> {
+ // expected-error @+1 {{'amdgpu.global_transpose_load' op source memory address space must be Global}}
+ %0 = amdgpu.global_transpose_load %src[%i, %j]
+ : memref<128x256xf16, 3> -> vector<8xf16>
+ func.return %0 : vector<8xf16>
+}
+
+// -----
+
func.func @gather_to_lds_non_lds(%idx1 : index, %mem1 : memref<32xf16>, %mem2 : memref<32xf16>) {
// expected-error @+1 {{'amdgpu.gather_to_lds' op destination memory address space must be Workgroup}}
amdgpu.gather_to_lds %mem1[%idx1], %mem2[%idx1] : vector<2xf16>, memref<32xf16>, memref<32xf16>
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 606a7768974bf..b05627d6ee967 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -675,6 +675,15 @@ func.func @transpose_load(%idx1 : index, %idx2 : index, %mem : memref<128x32xf16
func.return %0 : vector<4xf16>
}
+// CHECK-LABEL: func @global_transpose_load
+func.func @global_transpose_load(%i : index, %j : index,
+ %mem : memref<128x256xf16, #gpu.address_space<global>>) -> vector<8xf16> {
+ // CHECK: amdgpu.global_transpose_load
+ %0 = amdgpu.global_transpose_load %mem[%i, %j]
+ : memref<128x256xf16, #gpu.address_space<global>> -> vector<8xf16>
+ func.return %0 : vector<8xf16>
+}
+
// CHECK-LABEL: func @gather_to_lds
func.func @gather_to_lds(%idx1 : index, %idx2 : index, %mem1 : memref<32xf16>, %mem2 : memref<32x32xf16>, %smem1 : memref<32xf16, #gpu.address_space<workgroup>>, %smem2 : memref<32x32xf16, #gpu.address_space<workgroup>>, %smem3 : memref<?x?xf16, strided<[?, 1]>, #gpu.address_space<workgroup>>) {
// CHECK: amdgpu.gather_to_lds async %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}, %{{.*}}]
More information about the Mlir-commits mailing list