[Mlir-commits] [mlir] [MLIR][AMDGPU] Add amdgpu.global_transpose_load op for RDNA4 global memory transpose loads (PR #195287)
Nirvedh Meshram
llvmlistbot at llvm.org
Fri May 1 10:13:15 PDT 2026
================
@@ -2226,6 +2226,66 @@ struct TransposeLoadOpLowering
}
};
+struct GlobalTransposeLoadOpLowering
+ : public ConvertOpToLLVMPattern<GlobalTransposeLoadOp> {
+ GlobalTransposeLoadOpLowering(const LLVMTypeConverter &converter,
+ Chipset chipset)
+ : ConvertOpToLLVMPattern<GlobalTransposeLoadOp>(converter),
+ chipset(chipset) {}
+
+ Chipset chipset;
+
+ LogicalResult
+ matchAndRewrite(GlobalTransposeLoadOp op,
+ GlobalTransposeLoadOpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ if (chipset < kGfx1250)
+ return op.emitOpError(
+ "global_transpose_load is only supported on gfx1250+");
+
+ Location loc = op.getLoc();
+ auto srcMemRefType = cast<MemRefType>(op.getSrc().getType());
+ auto resultType = cast<VectorType>(op.getResult().getType());
+
+ Value srcPtr =
+ getStridedElementPtr(rewriter, loc, srcMemRefType, adaptor.getSrc(),
+ adaptor.getSrcIndices());
+
+ size_t numElements = resultType.getNumElements();
+ size_t elementTypeSize =
+ resultType.getElementType().getIntOrFloatBitWidth();
+
+ // ROCDL global transpose load intrinsics return vectors of i32 for
+ // sub-16-bit elements, matching the LDS lowering convention.
+ Type rocdlResultType =
+ elementTypeSize < 16
+ ? VectorType::get((numElements * elementTypeSize) / 32,
+ rewriter.getIntegerType(32))
+ : typeConverter->convertType(resultType);
+ Type llvmResultType = typeConverter->convertType(resultType);
+
+ switch (elementTypeSize) {
+ case 8: {
----------------
nirvedhmeshram wrote:
I added it and then removed it in the second commit because I don't see them in this ISA
https://docs.amd.com/v/u/en-US/rdna4-instruction-set-architecture, maybe better as a future commit for some archs that do have it?
https://github.com/llvm/llvm-project/pull/195287
More information about the Mlir-commits
mailing list