[Mlir-commits] [mlir] [AMDGPU] Adding AMDGPU dialect wrapper for ROCDL transpose loads. (PR #145395)
Alan Li
llvmlistbot at llvm.org
Tue Jun 24 19:51:19 PDT 2025
================
@@ -1100,6 +1100,81 @@ struct WMMAOpLowering : public ConvertOpToLLVMPattern<WMMAOp> {
}
};
+struct TransposeLoadOpLowering
+ : public ConvertOpToLLVMPattern<TransposeLoadOp> {
+ TransposeLoadOpLowering(const LLVMTypeConverter &converter, Chipset chipset)
+ : ConvertOpToLLVMPattern<TransposeLoadOp>(converter), chipset(chipset) {}
+
+ Chipset chipset;
+
+ LogicalResult
+ matchAndRewrite(TransposeLoadOp op, TransposeLoadOpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ if (chipset != kGfx950)
+ return op.emitOpError("Non-gfx950 chipset not supported");
+
+ Location loc = op.getLoc();
+ auto srcMemRefType = cast<MemRefType>(op.getSrc().getType());
+
+ // Elements in sub-byte memrefs are stored non-contiguously, so reject
+ // sub-byte source memrefs. Use emulated memrefs instead.
+ size_t srcElementSize =
+ srcMemRefType.getElementType().getIntOrFloatBitWidth();
+ if (srcElementSize < 8)
+ return op.emitOpError("Expect source memref to have at least 8 bits "
+ "element size, got ")
+ << srcElementSize;
+
+ auto resultType = cast<VectorType>(op.getResult().getType());
+ Value srcPtr =
+ getStridedElementPtr(rewriter, loc, srcMemRefType, adaptor.getSrc(),
+ (adaptor.getSrcIndices()));
+
+ size_t numElements = resultType.getNumElements();
+ size_t elementTypeSize =
+ resultType.getElementType().getIntOrFloatBitWidth();
+
+ // ROCDL transpose load intrinsics return vectors of 32-bit integers when
+ // the element size is smaller than 16 bits.
+ Type rocdlResultType = VectorType::get((numElements * elementTypeSize) / 32,
+ rewriter.getIntegerType(32));
+ Type llvmResultType = typeConverter->convertType(resultType);
----------------
lialan wrote:
Hmm, I am not following what this comment is about... @krzysz00?
https://github.com/llvm/llvm-project/pull/145395
More information about the Mlir-commits
mailing list