[Mlir-commits] [mlir] [ROCDL][WIP] Added matrix load-transpose ops for gfx1250+ (PR #165564)

Thu Oct 30 11:23:36 PDT 2025

================
@@ -650,6 +649,76 @@ def ROCDL_ds_read_tr8_b64 : ROCDL_LDS_Read_Tr_IntrOp<"ds.read.tr8.b64">;
 def ROCDL_ds_read_tr6_b96 : ROCDL_LDS_Read_Tr_IntrOp<"ds.read.tr6.b96">;
 def ROCDL_ds_read_tr16_b64 : ROCDL_LDS_Read_Tr_IntrOp<"ds.read.tr16.b64">;
 
+
+
+//===---------------------------------------------------------------------===//
+// Global/DS load-transpose intrinsics (available in GFX1250+)
+
+// Bundles a type with the bit width of one element of that type.
+class WrapperType<Type wrappedTy, int widthInBits> {
+  Type type = wrappedTy;
+  int bitwidth = widthInBits;
+}
+// Wrappers for `I` and `F` types; the width is taken from the wrapped type's
+// own `bitwidth` field.
+class IType<I intTy> : WrapperType<intTy, intTy.bitwidth>;
+class FType<F floatTy> : WrapperType<floatTy, floatTy.bitwidth>;
+// BF16 is wrapped with its width spelled out explicitly.
+def BF16Type : WrapperType<BF16, 16>;
+
+
+// Describes where a load reads from: a short name (used when building op
+// mnemonics), the numeric address space, and the pointer type constrained to
+// that address space.
+class AddrKind<string kindName, int addrSpace> {
+  string name = kindName;
+  int space = addrSpace;
+  LLVM_PointerInAddressSpace type = LLVM_PointerInAddressSpace<addrSpace>;
+}
+def GlobalAddrKind : AddrKind<"global", 1>;
+def DSAddrKind : AddrKind<"ds", 3>;
+
+// Computes the per-op pieces of a load-transpose op from its parameters:
+//   addKind      - address kind (name, address space, pointer type).
+//   inElemBits   - bit width of one input matrix element.
+//   outElemBits  - total bit width of the result vector.
+//   outElemType  - wrapped element type of the result vector.
+// Exposes the pointer type (`inType`), the result vector type (`outType`) and
+// the op `mnemonic`.
+class ROCDL_TrLoadOpMeta<AddrKind addKind, int inElemBits, int outElemBits, WrapperType outElemType> {
+  string inBits = !cast<string>(inElemBits);
+  string outBits = !cast<string>(outElemBits);
+  LLVM_PointerInAddressSpace inType = addKind.type;
+  // Number of result elements = total result bits / bits per element.
+  int outNumElem = !div(outElemBits, outElemType.bitwidth);
+  ROCDL_ConcreteVector outType = ROCDL_ConcreteVector<outElemType.type, outNumElem>;
+  // In address space 1 (global), 8- and 16-bit element loads omit the element
+  // width from the mnemonic (e.g. "global.load.tr.b64"); every other
+  // combination spells it out (e.g. "global.load.tr4.b64").
+  // The previous nested !if required inElemBits to equal 8 and 16 at the same
+  // time, so the width was never omitted.
+  string inBitsEnc = !if(!and(!eq(addKind.space, 1),
+                              !or(!eq(inElemBits, 8), !eq(inElemBits, 16))),
+                         "", inBits);
+  string mnemonic = addKind.name # ".load.tr" # inBitsEnc # ".b" # outBits;
+}
+
+// Op template shared by the load-transpose ops. Everything that varies between
+// ops (mnemonic, pointer type, result vector type, bit widths quoted in the
+// description) is taken from `meta`.
+// NOTE(review): the meaning of the positional ROCDL_IntrOp arguments
+// `[1], [], [], 1, 0, 1` is defined outside this hunk — confirm against
+// ROCDL_IntrOp.
+class ROCDL_TrLoadOp<ROCDL_TrLoadOpMeta meta> :
+  ROCDL_IntrOp<meta.mnemonic, [1], [], [], 1, 0, 1> {
+
+  // The pointer operand is marked as a memory read. `baseArgs` is appended
+  // after it (presumably inherited from ROCDL_IntrOp — TODO confirm).
+  dag args = (ins Arg<meta.inType, "", [MemRead]>:$ptr);
+  let arguments = !con(args, baseArgs);
+  let results = (outs meta.outType:$res);
+  let summary = "Loads and transposes a matrix from global memory or ds to registers (available in gfx1250+).";
+  // The description covers both address kinds, matching the summary above,
+  // because this class is instantiated with any AddrKind.
+  let description = [{
+    Load a matrix of }] # meta.inBits # [{-bit data from global memory or ds,
+    transpose data between row-major and column-major order,
+    and store the result into a }] # meta.outBits # [{-bit vector register.
+
+    Available in gfx1250+.
+  }];
+  let assemblyFormat = "$ptr attr-dict `:` type($ptr) `->` type($res)";
+  // The only operand this op accesses memory through is the pointer.
+  let extraClassDefinition = [{
+    ::llvm::SmallVector<::mlir::Value> $cppClass::getAccessedOperands() {
+      return {getPtr()};
+    }
+  }];
+}
+
+// Global-address-kind load-transpose ops. ROCDL_TrLoadOpMeta arguments, in
+// order: address kind, input element bit width, total result bit width, and
+// wrapped result element type. The result vector has
+// (total result bits / element bit width) elements.
+def ROCDL_GlobalLoadTr4_2I32 : ROCDL_TrLoadOp<ROCDL_TrLoadOpMeta<GlobalAddrKind, 4, 64, IType<I32>>>;
+def ROCDL_GlobalLoadTr8_2I32 : ROCDL_TrLoadOp<ROCDL_TrLoadOpMeta<GlobalAddrKind, 8, 64, IType<I32>>>;
+def ROCDL_GlobalLoadTr6_3I32 : ROCDL_TrLoadOp<ROCDL_TrLoadOpMeta<GlobalAddrKind, 6, 96, IType<I32>>>;
+// NOTE(review): the def below is named "Tr8" but passes 16 as the input
+// element bit width — confirm which one is intended.
+def ROCDL_GlobalLoadTr8_8I16 : ROCDL_TrLoadOp<ROCDL_TrLoadOpMeta<GlobalAddrKind, 16, 128, IType<I16>>>;
+// Disabled variants that differ only in the result element type (F16 / BF16).
+//def ROCDL_GlobalLoadTr8_8F16 : ROCDL_TrLoadOp<ROCDL_TrLoadOpMeta<GlobalAddrKind, 8, 128, FType<F16>>>;
+//def ROCDL_GlobalLoadTr8_8BF16 : ROCDL_TrLoadOp<ROCDL_TrLoadOpMeta<GlobalAddrKind, 8, 128, BF16Type>>;
----------------
krzysz00 wrote:

Strong reject of having "f16" and "bf16" and so on variants.

Just make it variadic.

https://github.com/llvm/llvm-project/pull/165564