[Mlir-commits] [mlir] [AMDGPU] Implement amdgpu.dot op (PR #193371)
Eric Feng
llvmlistbot at llvm.org
Wed Apr 22 22:23:31 PDT 2026
https://github.com/efric updated https://github.com/llvm/llvm-project/pull/193371
>From 2f07948ae1c912f123cac06e4a2d5ac473579a14 Mon Sep 17 00:00:00 2001
From: Eric Feng <Eric.Feng at amd.com>
Date: Tue, 21 Apr 2026 18:01:48 -0700
Subject: [PATCH 1/7] amdgpu dot ops
Signed-off-by: Eric Feng <Eric.Feng at amd.com>
---
.../mlir/Dialect/AMDGPU/IR/AMDGPUOps.td | 58 ++++++
.../mlir/Dialect/AMDGPU/Utils/Chipset.h | 58 +++++-
.../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 173 +++++++++++++++++-
mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp | 50 +++++
.../Conversion/AMDGPUToROCDL/dot-gfx11.mlir | 62 +++++++
.../Conversion/AMDGPUToROCDL/dot-gfx12.mlir | 31 ++++
.../Conversion/AMDGPUToROCDL/dot-gfx9.mlir | 65 +++++++
.../Conversion/AMDGPUToROCDL/dot-invalid.mlir | 41 +++++
mlir/test/Dialect/AMDGPU/invalid.mlir | 99 ++++++++++
mlir/test/Dialect/AMDGPU/ops.mlir | 74 ++++++++
.../Dialect/AMDGPU/AMDGPUUtilsTest.cpp | 84 +++++++++
11 files changed, 786 insertions(+), 9 deletions(-)
create mode 100644 mlir/test/Conversion/AMDGPUToROCDL/dot-gfx11.mlir
create mode 100644 mlir/test/Conversion/AMDGPUToROCDL/dot-gfx12.mlir
create mode 100644 mlir/test/Conversion/AMDGPUToROCDL/dot-gfx9.mlir
create mode 100644 mlir/test/Conversion/AMDGPUToROCDL/dot-invalid.mlir
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
index f8d4a3fdadf6b..1afc941332131 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
@@ -1091,6 +1091,64 @@ def AMDGPU_WMMAOp :
let hasVerifier = 1;
}
+// Source operand types accepted by `amdgpu.dot`: vectors of 2 x 16-bit,
+// 4 x 8-bit, or 8 x 4-bit elements (32 bits of packed data in every case).
+def DotInTypes : AnyTypeOf<[VectorOfLengthAndType<[2], [F16, BF16, I16]>,
+ VectorOfLengthAndType<[4], [I8, F8E4M3FN, F8E5M2]>,
+ VectorOfLengthAndType<[8], [I<4>]>]>;
+
+// Scalar accumulator / result types accepted by `amdgpu.dot`.
+def DotOutTypes : AnyTypeOf<[F32, F16, BF16, I32]>;
+
+def AMDGPU_DotOp :
+    AMDGPU_Op<"dot", [AllTypesMatch<["destC", "destD"]>, Pure]>,
+    Arguments<(ins DotInTypes:$sourceA,
+                   DotInTypes:$sourceB,
+                   DotOutTypes:$destC,
+                   UnitAttr:$unsignedA,
+                   UnitAttr:$unsignedB,
+                   UnitAttr:$clamp)>,
+    Results<(outs DotOutTypes:$destD)> {
+  let summary = "MLIR wrapper for AMDGPU v_dot* intrinsics";
+  let description = [{
+    The `amdgpu.dot` op is an MLIR wrapper over the `v_dot*` family of intrinsics,
+    which compute `D = sum_i A[i] * B[i] + C`.
+
+    Integer sources are treated as signed unless `unsignedA` / `unsignedB` is
+    set for the corresponding operand. These attributes are only valid for
+    integer sources, and 16-bit integer sources must use the same signedness
+    for both operands. `clamp` is forwarded to the selected intrinsic and is
+    rejected for the f16 -> f16, bf16 -> bf16, and fp8/bf8 variants, which have
+    no clamp attribute.
+
+    Variants (source, dest, signedness, chipset -> intrinsic). For the precise
+    per-feature chipset enablement, see `hasDot{N}Insts` in `Chipset.h`.
+
+    ```text
+    | A elem   | B elem   | destC | signedness | chipset                   | ROCDL op                     |
+    |----------|----------|-------|------------|---------------------------|------------------------------|
+    | f16      | f16      | f32   | n/a        | gfx906+                   | fdot2                        |
+    | f16      | f16      | f16   | n/a        | gfx11+                    | fdot2.f16.f16                |
+    | bf16     | bf16     | f32   | n/a        | gfx11+, gfx950+           | fdot2.f32.bf16               |
+    | bf16     | bf16     | bf16  | n/a        | gfx11+                    | fdot2.bf16.bf16              |
+    | i16      | i16      | i32   | s / u      | gfx906+, no gfx11+/gfx12+ | sdot2 / udot2                |
+    | i8       | i8       | i32   | s / u      | gfx906+                   | sdot4 / udot4                |
+    | i8       | i8       | i32   | mixed      | gfx11+                    | sudot4                       |
+    | i4       | i4       | i32   | s / u      | gfx906+                   | sdot8 / udot8                |
+    | i4       | i4       | i32   | mixed      | gfx11+                    | sudot8                       |
+    | fp8/bf8  | fp8/bf8  | f32   | n/a        | gfx11.7, gfx12+           | dot4.f32.{fp8,bf8}.{fp8,bf8} |
+    ```
+
+    Example:
+    ```mlir
+    %r0 = amdgpu.dot %a * %b + %c : vector<4xi8>, vector<4xi8>, i32
+    %r1 = amdgpu.dot %a * %b + %c {unsignedA, unsignedB, clamp}
+      : vector<8xi4>, vector<8xi4>, i32
+    %r2 = amdgpu.dot %a * %b + %c {unsignedB}
+      : vector<4xi8>, vector<4xi8>, i32
+    %r3 = amdgpu.dot %a * %b + %c : vector<2xf16>, vector<2xf16>, f32
+    %r4 = amdgpu.dot %a * %b + %c : vector<2xf16>, vector<2xf16>, f16
+    %r5 = amdgpu.dot %a * %b + %c
+      : vector<4xf8E4M3FN>, vector<4xf8E5M2>, f32
+    ```
+  }];
+  let assemblyFormat = [{
+    $sourceA `*` $sourceB `+` $destC attr-dict
+    `:` type($sourceA) `,` type($sourceB) `,` type($destC)
+  }];
+  let hasVerifier = 1;
+}
+
def AMDGPU_SparseMFMAOp :
AMDGPU_Op<"sparse_mfma", [AllTypesMatch<["destC", "destD"]>,
Pure]>,
diff --git a/mlir/include/mlir/Dialect/AMDGPU/Utils/Chipset.h b/mlir/include/mlir/Dialect/AMDGPU/Utils/Chipset.h
index ca9809799588c..3dab1eba9b526 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/Utils/Chipset.h
+++ b/mlir/include/mlir/Dialect/AMDGPU/Utils/Chipset.h
@@ -15,10 +15,10 @@ namespace mlir::amdgpu {
/// Represents the amdgpu gfx chipset version, e.g., gfx90a, gfx942, gfx1103.
/// Note that the leading digits form a decimal number, while the last two
-/// digits for a hexadecimal number. For example:
+/// digits form a hexadecimal number. For example:
/// gfx942 --> major = 9, minor = 0x4, stepping = 0x2
/// gfx90a --> major = 9, minor = 0x0, stepping = 0xa
-/// gfx1103 --> major = 10, minor = 0x0, stepping = 0x3
+/// gfx1103 --> major = 11, minor = 0x0, stepping = 0x3
struct Chipset {
unsigned majorVersion = 0; // The major version (decimal).
unsigned minorVersion = 0; // The minor version (hexadecimal).
@@ -54,6 +54,60 @@ inline bool hasOcpFp8(const Chipset &chipset) {
chipset.majorVersion >= 12;
}
+// Predicates mirroring the LLVM AMDGPU `HasDot{N}Insts` features that gate
+// the `v_dot*` instructions consumed by the `amdgpu.dot` lowering.
+
+/// Returns true if `chipset` provides the Dot1 instructions.
+///
+/// Accepted chipsets: gfx906 and later gfx9 parts except gfx909 and gfx90c,
+/// gfx1011, gfx1012, and gfx10.3+. The `amdgpu.dot` lowering consults this for
+/// the signed `sdot4` / `sdot8` forms, and `hasDot2Insts`, `hasDot7Insts` and
+/// `hasDot10Insts` build on it.
+inline bool hasDot1Insts(const Chipset &chipset) {
+  if (chipset.majorVersion == 9) {
+    // gfx909 and gfx90c compare greater than gfx906 but are gfx900-class APUs
+    // whose LLVM feature sets do not include the dot instructions.
+    if (chipset.minorVersion == 0 &&
+        (chipset.steppingVersion == 0x9 || chipset.steppingVersion == 0xc))
+      return false;
+    return chipset >= Chipset(9, 0, 6);
+  }
+  if (chipset.majorVersion == 10) {
+    // Within gfx10.1 only gfx1011 and gfx1012 qualify; gfx10.3+ always does.
+    if (chipset.minorVersion == 1)
+      return chipset.steppingVersion == 1u || chipset.steppingVersion == 2u;
+    return chipset.minorVersion >= 3u;
+  }
+  return false;
+}
+
+/// Dot2 gates the 16-bit integer `sdot2` / `udot2` lowering. It is reported
+/// for exactly the chipsets that `hasDot1Insts` accepts.
+inline bool hasDot2Insts(const Chipset &chipset) {
+ return hasDot1Insts(chipset);
+}
+
+/// Dot7 gates the unsigned `udot4` / `udot8` lowering. It is reported for
+/// every chipset that has Dot1 and for all gfx11+ chipsets.
+inline bool hasDot7Insts(const Chipset &chipset) {
+  if (hasDot1Insts(chipset))
+    return true;
+  return chipset.majorVersion >= 11;
+}
+
+/// Dot8 is reported for gfx11 and newer. In the `amdgpu.dot` lowering it gates
+/// the mixed-sign `sudot4` / `sudot8` intrinsics and is also accepted for the
+/// signed `sdot4` / `sdot8` forms.
+inline bool hasDot8Insts(const Chipset &chipset) {
+ return chipset.majorVersion >= 11;
+}
+
+/// Dot9 gates the `fdot2.f16.f16` and `fdot2.bf16.bf16` lowering. It is
+/// reported for all gfx11 chipsets and for gfx12.0.
+inline bool hasDot9Insts(const Chipset &chipset) {
+  const bool isGfx11 = chipset.majorVersion == 11;
+  const bool isGfx12_0 =
+      chipset.majorVersion == 12 && chipset.minorVersion == 0;
+  return isGfx11 || isGfx12_0;
+}
+
+/// Dot10 gates the f16 x f16 -> f32 `fdot2` lowering. It is reported for all
+/// gfx11 chipsets, for gfx12.0, and otherwise wherever Dot1 is available.
+inline bool hasDot10Insts(const Chipset &chipset) {
+  switch (chipset.majorVersion) {
+  case 11:
+    return true;
+  case 12:
+    return chipset.minorVersion == 0;
+  default:
+    return hasDot1Insts(chipset);
+  }
+}
+
+/// Dot11 gates the fp8/bf8 `dot4.f32.*` lowering. It is reported only for
+/// gfx11.7 and gfx12.0.
+inline bool hasDot11Insts(const Chipset &chipset) {
+  switch (chipset.majorVersion) {
+  case 11:
+    return chipset.minorVersion == 7u;
+  case 12:
+    return chipset.minorVersion == 0;
+  default:
+    return false;
+  }
+}
+
+/// Dot12 gates the bf16 x bf16 -> f32 `fdot2.f32.bf16` lowering. It is
+/// reported for gfx950, all gfx11 chipsets, and gfx12.0.
+inline bool hasDot12Insts(const Chipset &chipset) {
+  const bool isGfx950 = chipset == Chipset(9, 5, 0);
+  const bool isGfx11 = chipset.majorVersion == 11;
+  const bool isGfx12_0 =
+      chipset.majorVersion == 12 && chipset.minorVersion == 0;
+  return isGfx950 || isGfx11 || isGfx12_0;
+}
+
} // namespace mlir::amdgpu
#endif
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 14d99c250c0b6..a161d0da29aee 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -685,7 +685,7 @@ static Value packSmallFloatVectorOperand(ConversionPatternRewriter &rewriter,
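-/// Converts sparse MFMA/WMMA (smfmac/swmmac) operands to the expected ROCDL
-/// types.
+/// Converts packed vector operands (as used by sparse MFMA/WMMA and
+/// `amdgpu.dot`) to the expected ROCDL types.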
-static Value convertSparseVectorOperand(ConversionPatternRewriter &rewriter,
+static Value convertPackedVectorOperand(ConversionPatternRewriter &rewriter,
Location loc, Value input,
bool allowBf16 = true) {
Type inputType = input.getType();
@@ -1646,9 +1646,9 @@ struct SparseMFMAOpLowering : public ConvertOpToLLVMPattern<SparseMFMAOp> {
return op->emitOpError("sparse MFMA (smfmac) only supported on gfx942+");
bool isGfx950 = chipset >= kGfx950;
- Value a = convertSparseVectorOperand(rewriter, loc, adaptor.getSourceA(),
+ Value a = convertPackedVectorOperand(rewriter, loc, adaptor.getSourceA(),
isGfx950);
- Value b = convertSparseVectorOperand(rewriter, loc, adaptor.getSourceB(),
+ Value b = convertPackedVectorOperand(rewriter, loc, adaptor.getSourceB(),
isGfx950);
Value c = adaptor.getDestC();
@@ -1753,6 +1753,165 @@ struct WMMAOpLowering : public ConvertOpToLLVMPattern<WMMAOp> {
}
};
+/// Groups the ROCDL dot intrinsics by the set of attributes they accept, so
+/// the lowering knows which of `signA` / `signB` / `clamp` it may attach.
+enum class DotFamily {
+ /// ROCDL_Dot_IntrOp: single `clamp` attribute.
+ Clamp,
+ /// ROCDL_Dot_NoClamp_IntrOp: no attributes.
+ NoClamp,
+ /// ROCDL_Sudot_IntrOp: `signA`, `signB`, and `clamp` attributes.
+ Sudot,
+};
+
+/// Selects the ROCDL dot intrinsic that implements `op` on `chipset`.
+///
+/// Returns the ROCDL operation name paired with the `DotFamily` describing
+/// which attributes that intrinsic accepts, or `std::nullopt` when no
+/// intrinsic matches the (source element types, accumulator type, signedness)
+/// combination on the given chipset.
+static std::optional<std::pair<StringRef, DotFamily>>
+dotOpToIntrinsic(DotOp op, Chipset chipset) {
+ Type aElem = cast<VectorType>(op.getSourceA().getType()).getElementType();
+ Type bElem = cast<VectorType>(op.getSourceB().getType()).getElementType();
+ Type dest = op.getDestC().getType();
+ bool uA = op.getUnsignedA();
+ bool uB = op.getUnsignedB();
+
+ // f16 x f16 -> f32 / f16.
+ if (aElem.isF16() && bElem.isF16()) {
+ if (dest.isF32() && hasDot10Insts(chipset))
+ return {{ROCDL::fdot2::getOperationName(), DotFamily::Clamp}};
+ if (dest.isF16() && hasDot9Insts(chipset))
+ return {{ROCDL::fdot2_f16_f16::getOperationName(), DotFamily::NoClamp}};
+ return std::nullopt;
+ }
+
+ // bf16 x bf16 -> f32 / bf16.
+ if (aElem.isBF16() && bElem.isBF16()) {
+ if (dest.isF32() && hasDot12Insts(chipset))
+ return {{ROCDL::fdot2_f32_bf16::getOperationName(), DotFamily::Clamp}};
+ if (dest.isBF16() && hasDot9Insts(chipset))
+ return {{ROCDL::fdot2_bf16_bf16::getOperationName(),
+ DotFamily::NoClamp}};
+ return std::nullopt;
+ }
+
+ // Integer sources -> i32.
+ if (isa<IntegerType>(aElem) && isa<IntegerType>(bElem) &&
+ dest.isInteger(32)) {
+ bool mixedSign = (uA != uB);
+ // Only A's width is inspected; DotOp::verify rejects integer sources whose
+ // element types differ.
+ unsigned elemWidth = aElem.getIntOrFloatBitWidth();
+
+ // Operands disagree on signedness: only the sudot* intrinsics (i8 / i4)
+ // can express that, and they are gated on Dot8.
+ if (mixedSign) {
+ if (!hasDot8Insts(chipset))
+ return std::nullopt;
+ StringRef name;
+ switch (elemWidth) {
+ case 8:
+ name = ROCDL::sudot4::getOperationName();
+ break;
+ case 4:
+ name = ROCDL::sudot8::getOperationName();
+ break;
+ default:
+ return std::nullopt;
+ }
+ return {{name, DotFamily::Sudot}};
+ }
+
+ // Uniform signedness from here on, so `uA` alone selects the signed or
+ // unsigned intrinsic.
+ StringRef name;
+ bool supported = false;
+ switch (elemWidth) {
+ case 16:
+ supported = hasDot2Insts(chipset);
+ name = uA ? ROCDL::udot2::getOperationName()
+ : ROCDL::sdot2::getOperationName();
+ break;
+ case 8:
+ supported = uA ? hasDot7Insts(chipset)
+ : hasDot1Insts(chipset) || hasDot8Insts(chipset);
+ name = uA ? ROCDL::udot4::getOperationName()
+ : ROCDL::sdot4::getOperationName();
+ break;
+ case 4:
+ supported = uA ? hasDot7Insts(chipset)
+ : hasDot1Insts(chipset) || hasDot8Insts(chipset);
+ name = uA ? ROCDL::udot8::getOperationName()
+ : ROCDL::sdot8::getOperationName();
+ break;
+ default:
+ return std::nullopt;
+ }
+ if (!supported)
+ return std::nullopt;
+ return {{name, DotFamily::Clamp}};
+ }
+
+ // fp8/bf8 x fp8/bf8 -> f32.
+ bool aIsFp8 = isa<Float8E4M3FNType>(aElem);
+ bool aIsBf8 = isa<Float8E5M2Type>(aElem);
+ bool bIsFp8 = isa<Float8E4M3FNType>(bElem);
+ bool bIsBf8 = isa<Float8E5M2Type>(bElem);
+ if ((aIsFp8 || aIsBf8) && (bIsFp8 || bIsBf8) && dest.isF32()) {
+ if (!hasDot11Insts(chipset))
+ return std::nullopt;
+ // One intrinsic exists per (A format, B format) pair.
+ StringRef name;
+ if (aIsFp8 && bIsFp8)
+ name = ROCDL::dot4_f32_fp8_fp8::getOperationName();
+ else if (aIsFp8 && bIsBf8)
+ name = ROCDL::dot4_f32_fp8_bf8::getOperationName();
+ else if (aIsBf8 && bIsFp8)
+ name = ROCDL::dot4_f32_bf8_fp8::getOperationName();
+ else
+ name = ROCDL::dot4_f32_bf8_bf8::getOperationName();
+ return {{name, DotFamily::NoClamp}};
+ }
+
+ return std::nullopt;
+}
+
+/// Lowers `amdgpu.dot` to the ROCDL dot intrinsic chosen by
+/// `dotOpToIntrinsic` for the configured chipset. Emits an op error and fails
+/// when no intrinsic matches.
+struct DotOpLowering : public ConvertOpToLLVMPattern<DotOp> {
+ DotOpLowering(const LLVMTypeConverter &converter, Chipset chipset)
+ : ConvertOpToLLVMPattern<DotOp>(converter), chipset(chipset) {}
+
+ // Target chipset used to decide which intrinsics are available.
+ Chipset chipset;
+
+ LogicalResult
+ matchAndRewrite(DotOp op, DotOpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ Location loc = op.getLoc();
+
+ std::optional<std::pair<StringRef, DotFamily>> maybeIntrinsic =
+ dotOpToIntrinsic(op, chipset);
+ if (!maybeIntrinsic)
+ return op.emitOpError(
+ "no intrinsic matching dot on the given chipset: ")
+ << op.getSourceA().getType() << " * " << op.getSourceB().getType()
+ << " + " << op.getDestC().getType();
+
+ auto [intrinsicName, family] = maybeIntrinsic.value();
+
+ // Sources go through the shared packed-operand conversion; the accumulator
+ // is forwarded as converted by the adaptor.
+ Value a = convertPackedVectorOperand(rewriter, loc, adaptor.getSourceA());
+ Value b = convertPackedVectorOperand(rewriter, loc, adaptor.getSourceB());
+ Value c = adaptor.getDestC();
+
+ SmallVector<NamedAttribute, 3> attrs;
+ // The sudot intrinsics take "is signed" flags, i.e. the inverse of the
+ // op's `unsignedA` / `unsignedB` attributes.
+ if (family == DotFamily::Sudot) {
+ attrs.push_back(rewriter.getNamedAttr(
+ "signA", rewriter.getBoolAttr(!op.getUnsignedA())));
+ attrs.push_back(rewriter.getNamedAttr(
+ "signB", rewriter.getBoolAttr(!op.getUnsignedB())));
+ }
+
+ // `clamp` is attached only when requested, and never for the NoClamp
+ // family.
+ if (family != DotFamily::NoClamp && op.getClamp())
+ attrs.push_back(
+ rewriter.getNamedAttr("clamp", rewriter.getBoolAttr(true)));
+
+ Type resultType = typeConverter->convertType(op.getDestD().getType());
+
+ // The intrinsic is only known by name here, so the op is built generically
+ // through OperationState.
+ OperationState loweredOp(loc, intrinsicName);
+ loweredOp.addTypes(resultType);
+ loweredOp.addOperands({a, b, c});
+ loweredOp.addAttributes(attrs);
+ Operation *lowered = rewriter.create(loweredOp);
+ rewriter.replaceOp(op, lowered->getResults());
+ return success();
+ }
+};
+
struct SparseWMMAOpLowering : public ConvertOpToLLVMPattern<SparseWMMAOp> {
SparseWMMAOpLowering(const LLVMTypeConverter &converter, Chipset chipset)
: ConvertOpToLLVMPattern<SparseWMMAOp>(converter), chipset(chipset) {}
@@ -1803,14 +1962,14 @@ struct SparseWMMAOpLowering : public ConvertOpToLLVMPattern<SparseWMMAOp> {
const bool isGFX1250orHigher =
chipset.majorVersion == 12 && chipset.minorVersion >= 5;
- Value a = convertSparseVectorOperand(rewriter, loc, adaptor.getSourceA(),
+ Value a = convertPackedVectorOperand(rewriter, loc, adaptor.getSourceA(),
isGFX1250orHigher);
- Value b = convertSparseVectorOperand(rewriter, loc, adaptor.getSourceB(),
+ Value b = convertPackedVectorOperand(rewriter, loc, adaptor.getSourceB(),
isGFX1250orHigher);
Value c = adaptor.getDestC();
VectorType rawOutType = outType;
if (!isGFX1250orHigher) {
- c = convertSparseVectorOperand(rewriter, loc, adaptor.getDestC(), false);
+ c = convertPackedVectorOperand(rewriter, loc, adaptor.getDestC(), false);
rawOutType = cast<VectorType>(c.getType());
}
@@ -4191,7 +4350,7 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
AMDGPUDPPLowering, MemoryCounterWaitOpLowering, LDSBarrierOpLowering,
SchedBarrierOpLowering, MFMAOpLowering, ScaledMFMAOpLowering,
SparseMFMAOpLowering, WMMAOpLowering, ScaledWMMAOpLowering,
- SparseWMMAOpLowering, ExtPackedFp8OpLowering,
+ SparseWMMAOpLowering, DotOpLowering, ExtPackedFp8OpLowering,
ScaledExtPackedMatrixOpLowering, ScaledExtPackedOpLowering,
PackedScaledTruncOpLowering, PackedTrunc2xFp8OpLowering,
PackedStochRoundFp8OpLowering, GatherToLDSOpLowering,
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
index d4811275b6fd6..f19b0f3f9edf5 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
@@ -741,6 +741,56 @@ LogicalResult SparseWMMAOp::verify() {
return success();
}
+//===----------------------------------------------------------------------===//
+// DotOp
+//===----------------------------------------------------------------------===//
+/// Verifies chipset-independent constraints of `amdgpu.dot`: matching source
+/// element types (fp8 and bf8 may be mixed with each other), an accumulator
+/// type allowed for the source type, and legal use of `unsignedA`,
+/// `unsignedB`, and `clamp`.
+LogicalResult DotOp::verify() {
+ Type aElem = cast<VectorType>(getSourceA().getType()).getElementType();
+ Type bElem = cast<VectorType>(getSourceB().getType()).getElementType();
+ Type dest = getDestC().getType();
+
+ bool aIsFloat8 = aElem.isFloat(8);
+ bool bIsFloat8 = bElem.isFloat(8);
+ bool aIsInteger = isa<IntegerType>(aElem);
+
+ // Two 8-bit float sources may differ in format; every other pairing must
+ // use identical element types.
+ bool bothFloat8 = aIsFloat8 && bIsFloat8;
+ if (!bothFloat8 && aElem != bElem)
+ return emitOpError(
+ "expected source operands to have the same element type");
+
+ // Accumulator type allowed for each source element type. Past the check
+ // above, inspecting A alone is sufficient.
+ if (aElem.isF16()) {
+ if (!dest.isF32() && !dest.isF16())
+ return emitOpError("expected f32 or f16 accumulator for f16 sources");
+ } else if (aElem.isBF16()) {
+ if (!dest.isF32() && !dest.isBF16())
+ return emitOpError("expected f32 or bf16 accumulator for bf16 sources");
+ } else if (aIsInteger) {
+ if (!dest.isInteger(32))
+ return emitOpError("expected i32 accumulator for integer sources");
+ } else if (aIsFloat8) {
+ if (!dest.isF32())
+ return emitOpError("expected f32 accumulator for fp8 sources");
+ }
+
+ if ((getUnsignedA() || getUnsignedB()) && !aIsInteger)
+ return emitOpError(
+ "unsignedA/unsignedB are only valid for integer source types");
+
+ // The lowering only has mixed-sign (sudot) intrinsics for i8 and i4.
+ if (aElem.isInteger(16) && getUnsignedA() != getUnsignedB())
+ return emitOpError(
+ "mixed-sign dot is not supported for 16-bit integer sources");
+
+ // Combinations that lower to the no-clamp intrinsic family cannot honor
+ // `clamp`.
+ if (getClamp()) {
+ bool noClamp = (aElem.isF16() && dest.isF16()) ||
+ (aElem.isBF16() && dest.isBF16()) || aIsFloat8;
+ if (noClamp)
+ return emitOpError(
+ "clamp is not supported for this (source, accumulator) combination");
+ }
+
+ return success();
+}
+
//===----------------------------------------------------------------------===//
// DPPOp
//===----------------------------------------------------------------------===//
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/dot-gfx11.mlir b/mlir/test/Conversion/AMDGPUToROCDL/dot-gfx11.mlir
new file mode 100644
index 0000000000000..51d553748ad6a
--- /dev/null
+++ b/mlir/test/Conversion/AMDGPUToROCDL/dot-gfx11.mlir
@@ -0,0 +1,62 @@
+// RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx1100 | FileCheck %s
+
+// Variants first available on gfx11: fdot2 variants with narrower
+// accumulators, fdot2.f32.bf16, and the mixed-sign sudot* ops.
+
+// CHECK-LABEL: @dot_fdot2_f16_f16
+// f16 sources with an f16 accumulator: expects rocdl.fdot2.f16.f16 with the
+// operand types left unchanged.
+func.func @dot_fdot2_f16_f16(%a: vector<2xf16>, %b: vector<2xf16>, %c: f16) -> f16 {
+ // CHECK: rocdl.fdot2.f16.f16 %{{.+}}, %{{.+}}, %{{.+}} : (vector<2xf16>, vector<2xf16>, f16) -> f16
+ %r = amdgpu.dot %a * %b + %c : vector<2xf16>, vector<2xf16>, f16
+ func.return %r : f16
+}
+
+// CHECK-LABEL: @dot_fdot2_bf16_bf16
+// bf16 sources with a bf16 accumulator: expects rocdl.fdot2.bf16.bf16 with the
+// operand types left unchanged.
+func.func @dot_fdot2_bf16_bf16(%a: vector<2xbf16>, %b: vector<2xbf16>, %c: bf16) -> bf16 {
+ // CHECK: rocdl.fdot2.bf16.bf16 %{{.+}}, %{{.+}}, %{{.+}} : (vector<2xbf16>, vector<2xbf16>, bf16) -> bf16
+ %r = amdgpu.dot %a * %b + %c : vector<2xbf16>, vector<2xbf16>, bf16
+ func.return %r : bf16
+}
+
+// CHECK-LABEL: @dot_fdot2_f32_bf16
+// bf16 sources with an f32 accumulator: expects rocdl.fdot2.f32.bf16 with the
+// bf16 vectors passed through unchanged.
+func.func @dot_fdot2_f32_bf16(%a: vector<2xbf16>, %b: vector<2xbf16>, %c: f32) -> f32 {
+ // CHECK: rocdl.fdot2.f32.bf16 %{{.+}}, %{{.+}}, %{{.+}} : (vector<2xbf16>, vector<2xbf16>, f32) -> f32
+ %r = amdgpu.dot %a * %b + %c : vector<2xbf16>, vector<2xbf16>, f32
+ func.return %r : f32
+}
+
+// Uniform-sign sdot4 still dispatches to the dedicated rocdl.sdot4 (not
+// sudot4) on gfx11+. The backend aliases v_dot4_i32_i8 to v_dot4_i32_iu8
+// at llvm/lib/Target/AMDGPU/VOP3PInstructions.td:2647, so this produces
+// identical machine code to the gfx9 lowering.
+
+// CHECK-LABEL: @dot_sdot4_gfx11_uniform_sign
+// No unsigned attributes are set, so both operands are signed and the op
+// lowers to rocdl.sdot4 on i32-packed operands.
+func.func @dot_sdot4_gfx11_uniform_sign(%a: vector<4xi8>, %b: vector<4xi8>, %c: i32) -> i32 {
+ // CHECK: rocdl.sdot4 %{{.+}}, %{{.+}}, %{{.+}} : (i32, i32, i32) -> i32
+ %r = amdgpu.dot %a * %b + %c : vector<4xi8>, vector<4xi8>, i32
+ func.return %r : i32
+}
+
+// Mixed-sign i8 dot → rocdl.sudot4.
+
+// CHECK-LABEL: @dot_sudot4_signA_unsignedB
+// Only B is unsigned: the printed rocdl.sudot4 carries signA = true and no
+// signB.
+func.func @dot_sudot4_signA_unsignedB(%a: vector<4xi8>, %b: vector<4xi8>, %c: i32) -> i32 {
+ // CHECK: rocdl.sudot4 %{{.+}}, %{{.+}}, %{{.+}} {signA = true} : (i32, i32, i32) -> i32
+ %r = amdgpu.dot %a * %b + %c {unsignedB} : vector<4xi8>, vector<4xi8>, i32
+ func.return %r : i32
+}
+
+// CHECK-LABEL: @dot_sudot4_unsignedA_signB_clamp
+// Only A is unsigned and clamp is set: expects signB = true together with
+// clamp = true on rocdl.sudot4.
+func.func @dot_sudot4_unsignedA_signB_clamp(%a: vector<4xi8>, %b: vector<4xi8>, %c: i32) -> i32 {
+ // CHECK: rocdl.sudot4 %{{.+}}, %{{.+}}, %{{.+}} {clamp = true, signB = true} : (i32, i32, i32) -> i32
+ %r = amdgpu.dot %a * %b + %c {unsignedA, clamp} : vector<4xi8>, vector<4xi8>, i32
+ func.return %r : i32
+}
+
+// Mixed-sign i4 dot → rocdl.sudot8.
+
+// CHECK-LABEL: @dot_sudot8
+// i4 sources with only B unsigned: expects rocdl.sudot8 with signA = true.
+func.func @dot_sudot8(%a: vector<8xi4>, %b: vector<8xi4>, %c: i32) -> i32 {
+ // CHECK: rocdl.sudot8 %{{.+}}, %{{.+}}, %{{.+}} {signA = true} : (i32, i32, i32) -> i32
+ %r = amdgpu.dot %a * %b + %c {unsignedB} : vector<8xi4>, vector<8xi4>, i32
+ func.return %r : i32
+}
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/dot-gfx12.mlir b/mlir/test/Conversion/AMDGPUToROCDL/dot-gfx12.mlir
new file mode 100644
index 0000000000000..3213b5fa8f5c2
--- /dev/null
+++ b/mlir/test/Conversion/AMDGPUToROCDL/dot-gfx12.mlir
@@ -0,0 +1,31 @@
+// RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx1200 | FileCheck %s
+
+// CHECK-LABEL: @dot_fp8_fp8
+// fp8 x fp8: both sources are bitcast from vector<4xi8> to i32 before being
+// passed to rocdl.dot4.f32.fp8.fp8.
+func.func @dot_fp8_fp8(%a: vector<4xf8E4M3FN>, %b: vector<4xf8E4M3FN>, %c: f32) -> f32 {
+ // CHECK: %[[A:.+]] = llvm.bitcast %{{.+}} : vector<4xi8> to i32
+ // CHECK: %[[B:.+]] = llvm.bitcast %{{.+}} : vector<4xi8> to i32
+ // CHECK: rocdl.dot4.f32.fp8.fp8 %[[A]], %[[B]], %{{.+}} : (i32, i32, f32) -> f32
+ %r = amdgpu.dot %a * %b + %c : vector<4xf8E4M3FN>, vector<4xf8E4M3FN>, f32
+ func.return %r : f32
+}
+
+// CHECK-LABEL: @dot_fp8_bf8
+// fp8 (A) x bf8 (B): expects rocdl.dot4.f32.fp8.bf8.
+func.func @dot_fp8_bf8(%a: vector<4xf8E4M3FN>, %b: vector<4xf8E5M2>, %c: f32) -> f32 {
+ // CHECK: rocdl.dot4.f32.fp8.bf8 %{{.+}}, %{{.+}}, %{{.+}} : (i32, i32, f32) -> f32
+ %r = amdgpu.dot %a * %b + %c : vector<4xf8E4M3FN>, vector<4xf8E5M2>, f32
+ func.return %r : f32
+}
+
+// CHECK-LABEL: @dot_bf8_fp8
+// bf8 (A) x fp8 (B): expects rocdl.dot4.f32.bf8.fp8.
+func.func @dot_bf8_fp8(%a: vector<4xf8E5M2>, %b: vector<4xf8E4M3FN>, %c: f32) -> f32 {
+ // CHECK: rocdl.dot4.f32.bf8.fp8 %{{.+}}, %{{.+}}, %{{.+}} : (i32, i32, f32) -> f32
+ %r = amdgpu.dot %a * %b + %c : vector<4xf8E5M2>, vector<4xf8E4M3FN>, f32
+ func.return %r : f32
+}
+
+// CHECK-LABEL: @dot_bf8_bf8
+// bf8 x bf8: expects rocdl.dot4.f32.bf8.bf8.
+func.func @dot_bf8_bf8(%a: vector<4xf8E5M2>, %b: vector<4xf8E5M2>, %c: f32) -> f32 {
+ // CHECK: rocdl.dot4.f32.bf8.bf8 %{{.+}}, %{{.+}}, %{{.+}} : (i32, i32, f32) -> f32
+ %r = amdgpu.dot %a * %b + %c : vector<4xf8E5M2>, vector<4xf8E5M2>, f32
+ func.return %r : f32
+}
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/dot-gfx9.mlir b/mlir/test/Conversion/AMDGPUToROCDL/dot-gfx9.mlir
new file mode 100644
index 0000000000000..c3b4d459992a7
--- /dev/null
+++ b/mlir/test/Conversion/AMDGPUToROCDL/dot-gfx9.mlir
@@ -0,0 +1,65 @@
+// RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx906 | FileCheck %s
+
+// Dot variants available from gfx906. Integer sources arrive unpacked;
+// vector<4xi8> and vector<8xi4> are bitcast to scalar i32 (little-endian
+// lane order) before being passed to ROCDL.
+
+// CHECK-LABEL: @dot_fdot2
+// f16 sources with an f32 accumulator and no clamp: expects a plain
+// rocdl.fdot2.
+func.func @dot_fdot2(%a: vector<2xf16>, %b: vector<2xf16>, %c: f32) -> f32 {
+ // CHECK: rocdl.fdot2 %{{.+}}, %{{.+}}, %{{.+}} : (vector<2xf16>, vector<2xf16>, f32) -> f32
+ %r = amdgpu.dot %a * %b + %c : vector<2xf16>, vector<2xf16>, f32
+ func.return %r : f32
+}
+
+// CHECK-LABEL: @dot_fdot2_clamp
+// Same as above with `clamp`: the attribute is forwarded as clamp = true.
+func.func @dot_fdot2_clamp(%a: vector<2xf16>, %b: vector<2xf16>, %c: f32) -> f32 {
+ // CHECK: rocdl.fdot2 %{{.+}}, %{{.+}}, %{{.+}} {clamp = true} : (vector<2xf16>, vector<2xf16>, f32) -> f32
+ %r = amdgpu.dot %a * %b + %c {clamp} : vector<2xf16>, vector<2xf16>, f32
+ func.return %r : f32
+}
+
+// CHECK-LABEL: @dot_sdot2
+// Signed i16 sources: expects rocdl.sdot2 with the vector<2xi16> operands
+// left unchanged.
+func.func @dot_sdot2(%a: vector<2xi16>, %b: vector<2xi16>, %c: i32) -> i32 {
+ // CHECK: rocdl.sdot2 %{{.+}}, %{{.+}}, %{{.+}} : (vector<2xi16>, vector<2xi16>, i32) -> i32
+ %r = amdgpu.dot %a * %b + %c : vector<2xi16>, vector<2xi16>, i32
+ func.return %r : i32
+}
+
+// CHECK-LABEL: @dot_udot2_clamp
+// Both i16 sources unsigned, with clamp: expects rocdl.udot2 {clamp = true}.
+func.func @dot_udot2_clamp(%a: vector<2xi16>, %b: vector<2xi16>, %c: i32) -> i32 {
+ // CHECK: rocdl.udot2 %{{.+}}, %{{.+}}, %{{.+}} {clamp = true} : (vector<2xi16>, vector<2xi16>, i32) -> i32
+ %r = amdgpu.dot %a * %b + %c {unsignedA, unsignedB, clamp} : vector<2xi16>, vector<2xi16>, i32
+ func.return %r : i32
+}
+
+// CHECK-LABEL: @dot_sdot4
+// Signed i8 sources: each vector<4xi8> is bitcast to i32 and the results feed
+// rocdl.sdot4.
+func.func @dot_sdot4(%a: vector<4xi8>, %b: vector<4xi8>, %c: i32) -> i32 {
+ // CHECK: %[[A:.+]] = llvm.bitcast %{{.+}} : vector<4xi8> to i32
+ // CHECK: %[[B:.+]] = llvm.bitcast %{{.+}} : vector<4xi8> to i32
+ // CHECK: rocdl.sdot4 %[[A]], %[[B]], %{{.+}} : (i32, i32, i32) -> i32
+ %r = amdgpu.dot %a * %b + %c : vector<4xi8>, vector<4xi8>, i32
+ func.return %r : i32
+}
+
+// CHECK-LABEL: @dot_udot4_clamp
+// Both i8 sources unsigned, with clamp: expects rocdl.udot4 {clamp = true}.
+func.func @dot_udot4_clamp(%a: vector<4xi8>, %b: vector<4xi8>, %c: i32) -> i32 {
+ // CHECK: rocdl.udot4 %{{.+}}, %{{.+}}, %{{.+}} {clamp = true} : (i32, i32, i32) -> i32
+ %r = amdgpu.dot %a * %b + %c {unsignedA, unsignedB, clamp} : vector<4xi8>, vector<4xi8>, i32
+ func.return %r : i32
+}
+
+// CHECK-LABEL: @dot_sdot8
+// Signed i4 sources: each vector<8xi4> is bitcast to i32 and the results feed
+// rocdl.sdot8.
+func.func @dot_sdot8(%a: vector<8xi4>, %b: vector<8xi4>, %c: i32) -> i32 {
+ // CHECK: %[[A:.+]] = llvm.bitcast %{{.+}} : vector<8xi4> to i32
+ // CHECK: %[[B:.+]] = llvm.bitcast %{{.+}} : vector<8xi4> to i32
+ // CHECK: rocdl.sdot8 %[[A]], %[[B]], %{{.+}} : (i32, i32, i32) -> i32
+ %r = amdgpu.dot %a * %b + %c : vector<8xi4>, vector<8xi4>, i32
+ func.return %r : i32
+}
+
+// CHECK-LABEL: @dot_udot8
+// Both i4 sources unsigned, no clamp: expects a plain rocdl.udot8.
+func.func @dot_udot8(%a: vector<8xi4>, %b: vector<8xi4>, %c: i32) -> i32 {
+ // CHECK: rocdl.udot8 %{{.+}}, %{{.+}}, %{{.+}} : (i32, i32, i32) -> i32
+ %r = amdgpu.dot %a * %b + %c {unsignedA, unsignedB} : vector<8xi4>, vector<8xi4>, i32
+ func.return %r : i32
+}
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/dot-invalid.mlir b/mlir/test/Conversion/AMDGPUToROCDL/dot-invalid.mlir
new file mode 100644
index 0000000000000..6265f95b4abd4
--- /dev/null
+++ b/mlir/test/Conversion/AMDGPUToROCDL/dot-invalid.mlir
@@ -0,0 +1,41 @@
+// RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx906 --split-input-file -verify-diagnostics
+// RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx942 --split-input-file -verify-diagnostics
+
+// fp8/bf8 dot4 requires hasDot11Insts (gfx11.7 or gfx12.0); neither gfx906 nor gfx942 qualifies.
+func.func @dot_fp8_requires_gfx12(%a: vector<4xf8E4M3FN>, %b: vector<4xf8E4M3FN>, %c: f32) -> f32 {
+ // expected-error @below {{'amdgpu.dot' op no intrinsic matching dot on the given chipset}}
+ // expected-error @below {{failed to legalize operation 'amdgpu.dot'}}
+ %r = amdgpu.dot %a * %b + %c : vector<4xf8E4M3FN>, vector<4xf8E4M3FN>, f32
+ func.return %r : f32
+}
+
+// -----
+
+// fdot2.f16.f16 (f16 accumulator for f16 x f16) requires gfx11+.
+func.func @dot_f16_f16_requires_gfx11(%a: vector<2xf16>, %b: vector<2xf16>, %c: f16) -> f16 {
+ // expected-error @below {{'amdgpu.dot' op no intrinsic matching dot on the given chipset}}
+ // expected-error @below {{failed to legalize operation 'amdgpu.dot'}}
+ %r = amdgpu.dot %a * %b + %c : vector<2xf16>, vector<2xf16>, f16
+ func.return %r : f16
+}
+
+// -----
+
+// fdot2.f32.bf16 is available on gfx11+ and gfx950 (ROCDLOps.td:1532);
+// neither gfx906 nor gfx942 supports it.
+func.func @dot_f32_bf16_requires_gfx11_or_gfx950(%a: vector<2xbf16>, %b: vector<2xbf16>, %c: f32) -> f32 {
+ // expected-error @below {{'amdgpu.dot' op no intrinsic matching dot on the given chipset}}
+ // expected-error @below {{failed to legalize operation 'amdgpu.dot'}}
+ %r = amdgpu.dot %a * %b + %c : vector<2xbf16>, vector<2xbf16>, f32
+ func.return %r : f32
+}
+
+// -----
+
+// Mixed-sign integer dot (sudot) requires gfx11+.
+func.func @dot_mixed_sign_requires_gfx11(%a: vector<4xi8>, %b: vector<4xi8>, %c: i32) -> i32 {
+ // expected-error @below {{'amdgpu.dot' op no intrinsic matching dot on the given chipset}}
+ // expected-error @below {{failed to legalize operation 'amdgpu.dot'}}
+ %r = amdgpu.dot %a * %b + %c {unsignedB} : vector<4xi8>, vector<4xi8>, i32
+ func.return %r : i32
+}
diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir
index d7d449bd8a579..4f59ec642171f 100644
--- a/mlir/test/Dialect/AMDGPU/invalid.mlir
+++ b/mlir/test/Dialect/AMDGPU/invalid.mlir
@@ -777,3 +777,102 @@ func.func @global_prefetch_nt_ht_not_speculative(%src: memref<64x64xf16, #gpu.ad
amdgpu.global_prefetch %src[%i, %j] NT_HT DEV : memref<64x64xf16, #gpu.address_space<global>>
func.return
}
+
+// -----
+
+// DotOp: unsignedA is invalid on a float source.
+func.func @dot_float_source_unsigned_a(%a: vector<2xf16>, %b: vector<2xf16>, %c: f32) -> f32 {
+ // expected-error @+1 {{'amdgpu.dot' op unsignedA/unsignedB are only valid for integer source types}}
+ %r = amdgpu.dot %a * %b + %c {unsignedA} : vector<2xf16>, vector<2xf16>, f32
+ func.return %r : f32
+}
+
+// -----
+
+// DotOp: unsignedB is invalid on a float source.
+func.func @dot_float_source_unsigned_b(%a: vector<2xbf16>, %b: vector<2xbf16>, %c: f32) -> f32 {
+ // expected-error @+1 {{'amdgpu.dot' op unsignedA/unsignedB are only valid for integer source types}}
+ %r = amdgpu.dot %a * %b + %c {unsignedB} : vector<2xbf16>, vector<2xbf16>, f32
+ func.return %r : f32
+}
+
+// -----
+
+// DotOp: integer source requires i32 accumulator.
+func.func @dot_integer_bad_accumulator(%a: vector<4xi8>, %b: vector<4xi8>, %c: f32) -> f32 {
+ // expected-error @+1 {{'amdgpu.dot' op expected i32 accumulator for integer sources}}
+ %r = amdgpu.dot %a * %b + %c : vector<4xi8>, vector<4xi8>, f32
+ func.return %r : f32
+}
+
+// -----
+
+// DotOp: source element types must match for non-fp8 sources.
+func.func @dot_cross_float_elems(%a: vector<2xf16>, %b: vector<2xbf16>, %c: f32) -> f32 {
+ // expected-error @+1 {{'amdgpu.dot' op expected source operands to have the same element type}}
+ %r = amdgpu.dot %a * %b + %c : vector<2xf16>, vector<2xbf16>, f32
+ func.return %r : f32
+}
+
+// -----
+
+// DotOp: fp8 and non-fp8 sources cannot be mixed.
+func.func @dot_fp8_mixed_with_int(%a: vector<4xf8E4M3FN>, %b: vector<4xi8>, %c: f32) -> f32 {
+ // expected-error @+1 {{'amdgpu.dot' op expected source operands to have the same element type}}
+ %r = amdgpu.dot %a * %b + %c : vector<4xf8E4M3FN>, vector<4xi8>, f32
+ func.return %r : f32
+}
+
+// -----
+
+// DotOp: fp8 source requires f32 accumulator.
+func.func @dot_fp8_bad_accumulator(%a: vector<4xf8E4M3FN>, %b: vector<4xf8E4M3FN>, %c: f16) -> f16 {
+ // expected-error @+1 {{'amdgpu.dot' op expected f32 accumulator for fp8 sources}}
+ %r = amdgpu.dot %a * %b + %c : vector<4xf8E4M3FN>, vector<4xf8E4M3FN>, f16
+ func.return %r : f16
+}
+
+// -----
+
+// DotOp: clamp is illegal for (f16, f16) — no clamp bit in fdot2.f16.f16.
+func.func @dot_clamp_f16_f16(%a: vector<2xf16>, %b: vector<2xf16>, %c: f16) -> f16 {
+ // expected-error @+1 {{'amdgpu.dot' op clamp is not supported for this (source, accumulator) combination}}
+ %r = amdgpu.dot %a * %b + %c {clamp} : vector<2xf16>, vector<2xf16>, f16
+ func.return %r : f16
+}
+
+// -----
+
+// DotOp: clamp is illegal for (bf16, bf16) — no clamp bit in fdot2.bf16.bf16.
+func.func @dot_clamp_bf16_bf16(%a: vector<2xbf16>, %b: vector<2xbf16>, %c: bf16) -> bf16 {
+ // expected-error @+1 {{'amdgpu.dot' op clamp is not supported for this (source, accumulator) combination}}
+ %r = amdgpu.dot %a * %b + %c {clamp} : vector<2xbf16>, vector<2xbf16>, bf16
+ func.return %r : bf16
+}
+
+// -----
+
+// DotOp: clamp is illegal for any fp8 variant — no clamp bit in dot4.f32.*.
+func.func @dot_clamp_fp8(%a: vector<4xf8E4M3FN>, %b: vector<4xf8E4M3FN>, %c: f32) -> f32 {
+ // expected-error @+1 {{'amdgpu.dot' op clamp is not supported for this (source, accumulator) combination}}
+ %r = amdgpu.dot %a * %b + %c {clamp} : vector<4xf8E4M3FN>, vector<4xf8E4M3FN>, f32
+ func.return %r : f32
+}
+
+// -----
+
+// DotOp: clamp is illegal for bf8 (F8E5M2) sources as well.
+func.func @dot_clamp_bf8(%a: vector<4xf8E5M2>, %b: vector<4xf8E5M2>, %c: f32) -> f32 {
+ // expected-error @+1 {{'amdgpu.dot' op clamp is not supported for this (source, accumulator) combination}}
+ %r = amdgpu.dot %a * %b + %c {clamp} : vector<4xf8E5M2>, vector<4xf8E5M2>, f32
+ func.return %r : f32
+}
+
+// -----
+
+// DotOp: mixed-sign i16 dot has no hardware support (no sudot2 intrinsic).
+func.func @dot_mixed_sign_i16(%a: vector<2xi16>, %b: vector<2xi16>, %c: i32) -> i32 {
+ // expected-error @+1 {{'amdgpu.dot' op mixed-sign dot is not supported for 16-bit integer sources}}
+ %r = amdgpu.dot %a * %b + %c {unsignedA} : vector<2xi16>, vector<2xi16>, i32
+ func.return %r : i32
+}
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 6f4dd486610cc..a34550dc25420 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -865,3 +865,77 @@ func.func @ds_barrier_ops(%barrier: memref<!amdgpu.ds_barrier_state, #gpu.addres
%parity = amdgpu.ds_barrier_state_phase_parity %state : !amdgpu.ds_barrier_state -> i1
func.return
}
+
+// CHECK-LABEL: func @dot_f16_f32
+func.func @dot_f16_f32(%a: vector<2xf16>, %b: vector<2xf16>, %c: f32) -> f32 {
+ // CHECK: amdgpu.dot {{.*}} : vector<2xf16>, vector<2xf16>, f32
+ %r = amdgpu.dot %a * %b + %c : vector<2xf16>, vector<2xf16>, f32
+ // CHECK: amdgpu.dot {{.*}} {clamp} : vector<2xf16>, vector<2xf16>, f32
+ %s = amdgpu.dot %a * %b + %c {clamp} : vector<2xf16>, vector<2xf16>, f32
+ func.return %r : f32
+}
+
+// CHECK-LABEL: func @dot_f16_f16
+func.func @dot_f16_f16(%a: vector<2xf16>, %b: vector<2xf16>, %c: f16) -> f16 {
+ // CHECK: amdgpu.dot {{.*}} : vector<2xf16>, vector<2xf16>, f16
+ %r = amdgpu.dot %a * %b + %c : vector<2xf16>, vector<2xf16>, f16
+ func.return %r : f16
+}
+
+// CHECK-LABEL: func @dot_bf16
+func.func @dot_bf16(%a: vector<2xbf16>, %b: vector<2xbf16>, %c: f32, %d: bf16) {
+ // CHECK: amdgpu.dot {{.*}} {clamp} : vector<2xbf16>, vector<2xbf16>, f32
+ %r = amdgpu.dot %a * %b + %c {clamp} : vector<2xbf16>, vector<2xbf16>, f32
+ // CHECK: amdgpu.dot {{.*}} : vector<2xbf16>, vector<2xbf16>, bf16
+ %s = amdgpu.dot %a * %b + %d : vector<2xbf16>, vector<2xbf16>, bf16
+ func.return
+}
+
+// CHECK-LABEL: func @dot_i16
+func.func @dot_i16(%a: vector<2xi16>, %b: vector<2xi16>, %c: i32) -> i32 {
+ // CHECK: amdgpu.dot {{.*}} : vector<2xi16>, vector<2xi16>, i32
+ %r = amdgpu.dot %a * %b + %c : vector<2xi16>, vector<2xi16>, i32
+ // CHECK: amdgpu.dot {{.*}} {clamp} : vector<2xi16>, vector<2xi16>, i32
+ %s = amdgpu.dot %a * %b + %c {clamp} : vector<2xi16>, vector<2xi16>, i32
+ // CHECK: amdgpu.dot {{.*}} {unsignedA, unsignedB} : vector<2xi16>, vector<2xi16>, i32
+ %t = amdgpu.dot %a * %b + %c {unsignedA, unsignedB} : vector<2xi16>, vector<2xi16>, i32
+ func.return %r : i32
+}
+
+// CHECK-LABEL: func @dot_i8
+func.func @dot_i8(%a: vector<4xi8>, %b: vector<4xi8>, %c: i32) -> i32 {
+ // CHECK: amdgpu.dot {{.*}} : vector<4xi8>, vector<4xi8>, i32
+ %r = amdgpu.dot %a * %b + %c : vector<4xi8>, vector<4xi8>, i32
+ // CHECK: amdgpu.dot {{.*}} {clamp, unsignedA, unsignedB} : vector<4xi8>, vector<4xi8>, i32
+ %s = amdgpu.dot %a * %b + %c {unsignedA, unsignedB, clamp} : vector<4xi8>, vector<4xi8>, i32
+ // CHECK: amdgpu.dot {{.*}} {unsignedB} : vector<4xi8>, vector<4xi8>, i32
+ %t = amdgpu.dot %a * %b + %c {unsignedB} : vector<4xi8>, vector<4xi8>, i32
+ // CHECK: amdgpu.dot {{.*}} {unsignedA} : vector<4xi8>, vector<4xi8>, i32
+ %u = amdgpu.dot %a * %b + %c {unsignedA} : vector<4xi8>, vector<4xi8>, i32
+ func.return %r : i32
+}
+
+// CHECK-LABEL: func @dot_i4
+func.func @dot_i4(%a: vector<8xi4>, %b: vector<8xi4>, %c: i32) -> i32 {
+ // CHECK: amdgpu.dot {{.*}} : vector<8xi4>, vector<8xi4>, i32
+ %r = amdgpu.dot %a * %b + %c : vector<8xi4>, vector<8xi4>, i32
+ // CHECK: amdgpu.dot {{.*}} {clamp, unsignedA, unsignedB} : vector<8xi4>, vector<8xi4>, i32
+ %s = amdgpu.dot %a * %b + %c {unsignedA, unsignedB, clamp} : vector<8xi4>, vector<8xi4>, i32
+ // CHECK: amdgpu.dot {{.*}} {unsignedA} : vector<8xi4>, vector<8xi4>, i32
+ %t = amdgpu.dot %a * %b + %c {unsignedA} : vector<8xi4>, vector<8xi4>, i32
+ func.return %r : i32
+}
+
+// CHECK-LABEL: func @dot_fp8
+func.func @dot_fp8(%a: vector<4xf8E4M3FN>, %b: vector<4xf8E4M3FN>,
+ %e: vector<4xf8E5M2>, %c: f32) -> f32 {
+ // CHECK: amdgpu.dot {{.*}} : vector<4xf8E4M3FN>, vector<4xf8E4M3FN>, f32
+ %r0 = amdgpu.dot %a * %a + %c : vector<4xf8E4M3FN>, vector<4xf8E4M3FN>, f32
+ // CHECK: amdgpu.dot {{.*}} : vector<4xf8E4M3FN>, vector<4xf8E5M2>, f32
+ %r1 = amdgpu.dot %a * %e + %c : vector<4xf8E4M3FN>, vector<4xf8E5M2>, f32
+ // CHECK: amdgpu.dot {{.*}} : vector<4xf8E5M2>, vector<4xf8E4M3FN>, f32
+ %r2 = amdgpu.dot %e * %a + %c : vector<4xf8E5M2>, vector<4xf8E4M3FN>, f32
+ // CHECK: amdgpu.dot {{.*}} : vector<4xf8E5M2>, vector<4xf8E5M2>, f32
+ %r3 = amdgpu.dot %e * %e + %c : vector<4xf8E5M2>, vector<4xf8E5M2>, f32
+ func.return %r0 : f32
+}
diff --git a/mlir/unittests/Dialect/AMDGPU/AMDGPUUtilsTest.cpp b/mlir/unittests/Dialect/AMDGPU/AMDGPUUtilsTest.cpp
index 570d56f3c6ff1..a70067ca52ce5 100644
--- a/mlir/unittests/Dialect/AMDGPU/AMDGPUUtilsTest.cpp
+++ b/mlir/unittests/Dialect/AMDGPU/AMDGPUUtilsTest.cpp
@@ -58,5 +58,89 @@ TEST(ChipsetTest, Comparison) {
EXPECT_FALSE(Chipset(9, 0, 0xa) >= Chipset(9, 4, 2));
}
+TEST(ChipsetTest, HasDot1Insts) {
+ // gfx9: enabled from gfx906 onward.
+ EXPECT_FALSE(hasDot1Insts(Chipset(9, 0, 0)));
+ EXPECT_TRUE(hasDot1Insts(Chipset(9, 0, 6)));
+ EXPECT_TRUE(hasDot1Insts(Chipset(9, 0, 8)));
+ EXPECT_TRUE(hasDot1Insts(Chipset(9, 0, 0xa)));
+ EXPECT_TRUE(hasDot1Insts(Chipset(9, 4, 2)));
+ EXPECT_TRUE(hasDot1Insts(Chipset(9, 5, 0)));
+
+ // gfx10: only gfx10.1.1, gfx10.1.2, and gfx10.3+ enable Dot1.
+ EXPECT_FALSE(hasDot1Insts(Chipset(10, 1, 0))); // gfx1010
+ EXPECT_TRUE(hasDot1Insts(Chipset(10, 1, 1))); // gfx1011
+ EXPECT_TRUE(hasDot1Insts(Chipset(10, 1, 2))); // gfx1012
+ EXPECT_FALSE(hasDot1Insts(Chipset(10, 1, 3))); // gfx1013
+ EXPECT_TRUE(hasDot1Insts(Chipset(10, 3, 0))); // gfx1030
+
+ // Not on gfx11+/gfx12+/gfx13+.
+ EXPECT_FALSE(hasDot1Insts(Chipset(11, 0, 0)));
+ EXPECT_FALSE(hasDot1Insts(Chipset(12, 0, 0)));
+ EXPECT_FALSE(hasDot1Insts(Chipset(12, 5, 0)));
+ EXPECT_FALSE(hasDot1Insts(Chipset(13, 0, 0)));
+}
+
+TEST(ChipsetTest, HasDot7Insts) {
+ // Same as Dot1 plus all of gfx11+/gfx12+/gfx13+.
+ EXPECT_FALSE(hasDot7Insts(Chipset(9, 0, 0)));
+ EXPECT_TRUE(hasDot7Insts(Chipset(9, 0, 6)));
+ EXPECT_FALSE(hasDot7Insts(Chipset(10, 1, 0)));
+ EXPECT_TRUE(hasDot7Insts(Chipset(11, 0, 0)));
+ EXPECT_TRUE(hasDot7Insts(Chipset(12, 0, 0)));
+ EXPECT_TRUE(hasDot7Insts(Chipset(12, 5, 0))); // gfx1250 still has Dot7.
+ EXPECT_TRUE(hasDot7Insts(Chipset(13, 0, 0)));
+}
+
+TEST(ChipsetTest, HasDot8Insts) {
+ // gfx11+ only.
+ EXPECT_FALSE(hasDot8Insts(Chipset(9, 4, 2)));
+ EXPECT_FALSE(hasDot8Insts(Chipset(10, 3, 0)));
+ EXPECT_TRUE(hasDot8Insts(Chipset(11, 0, 0)));
+ EXPECT_TRUE(hasDot8Insts(Chipset(12, 5, 0))); // gfx1250 has Dot8.
+}
+
+TEST(ChipsetTest, HasDot9Insts) {
+ // gfx11.x and gfx12.0 only.
+ EXPECT_FALSE(hasDot9Insts(Chipset(9, 4, 2)));
+ EXPECT_FALSE(hasDot9Insts(Chipset(10, 3, 0)));
+ EXPECT_TRUE(hasDot9Insts(Chipset(11, 0, 0)));
+ EXPECT_TRUE(hasDot9Insts(Chipset(11, 7, 0)));
+ EXPECT_TRUE(hasDot9Insts(Chipset(12, 0, 0)));
+ EXPECT_FALSE(hasDot9Insts(Chipset(12, 5, 0))); // gfx1250 lacks Dot9.
+ EXPECT_FALSE(hasDot9Insts(Chipset(13, 0, 0)));
+}
+
+TEST(ChipsetTest, HasDot10Insts) {
+ // Dot1's set plus gfx11.x and gfx12.0 (excludes gfx12.5+/gfx13+).
+ EXPECT_TRUE(hasDot10Insts(Chipset(9, 0, 6)));
+ EXPECT_FALSE(hasDot10Insts(Chipset(10, 1, 0)));
+ EXPECT_TRUE(hasDot10Insts(Chipset(10, 3, 0)));
+ EXPECT_TRUE(hasDot10Insts(Chipset(11, 0, 0)));
+ EXPECT_TRUE(hasDot10Insts(Chipset(12, 0, 0)));
+ EXPECT_FALSE(hasDot10Insts(Chipset(12, 5, 0)));
+ EXPECT_FALSE(hasDot10Insts(Chipset(13, 0, 0)));
+}
+
+TEST(ChipsetTest, HasDot11Insts) {
+ // Only gfx11.7 and gfx12.0.
+ EXPECT_FALSE(hasDot11Insts(Chipset(9, 5, 0)));
+ EXPECT_FALSE(hasDot11Insts(Chipset(11, 0, 0)));
+ EXPECT_FALSE(hasDot11Insts(Chipset(11, 5, 0)));
+ EXPECT_TRUE(hasDot11Insts(Chipset(11, 7, 0)));
+ EXPECT_TRUE(hasDot11Insts(Chipset(12, 0, 0)));
+ EXPECT_FALSE(hasDot11Insts(Chipset(12, 5, 0)));
+}
+
+TEST(ChipsetTest, HasDot12Insts) {
+ // gfx9.5.0, gfx11.x, and gfx12.0.
+ EXPECT_FALSE(hasDot12Insts(Chipset(9, 0, 6)));
+ EXPECT_FALSE(hasDot12Insts(Chipset(9, 4, 2)));
+ EXPECT_TRUE(hasDot12Insts(Chipset(9, 5, 0)));
+ EXPECT_TRUE(hasDot12Insts(Chipset(11, 0, 0)));
+ EXPECT_TRUE(hasDot12Insts(Chipset(12, 0, 0)));
+ EXPECT_FALSE(hasDot12Insts(Chipset(12, 5, 0)));
+}
+
} // namespace
} // namespace mlir::amdgpu
>From 1c62f4078c0d1b349182f3defa1775fd05fb0180 Mon Sep 17 00:00:00 2001
From: Eric Feng <Eric.Feng at amd.com>
Date: Tue, 21 Apr 2026 18:05:56 -0700
Subject: [PATCH 2/7] format
Signed-off-by: Eric Feng <Eric.Feng at amd.com>
---
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td | 2 +-
mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp | 6 ++----
2 files changed, 3 insertions(+), 5 deletions(-)
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
index 1afc941332131..d7de80e008cf5 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
@@ -1126,7 +1126,7 @@ def AMDGPU_DotOp :
| i8 | i8 | i32 | mixed | gfx11+ | sudot4 |
| i4 | i4 | i32 | s / u | gfx906+ | sdot8 / udot8 |
| i4 | i4 | i32 | mixed | gfx11+ | sudot8 |
- | fp8/bf8 | fp8/bf8 | f32 | n/a | gfx11.7, gfx12+ | dot4.f32.{fp8,bf8}.{fp8,bf8} |
+ | fp8/bf8 | fp8/bf8 | f32 | n/a | gfx11.7, gfx12+ | dot4.f32.{fp8,bf8}.{fp8,bf8} |
```
Example:
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index a161d0da29aee..5455a302d4d49 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -1784,8 +1784,7 @@ dotOpToIntrinsic(DotOp op, Chipset chipset) {
if (dest.isF32() && hasDot12Insts(chipset))
return {{ROCDL::fdot2_f32_bf16::getOperationName(), DotFamily::Clamp}};
if (dest.isBF16() && hasDot9Insts(chipset))
- return {{ROCDL::fdot2_bf16_bf16::getOperationName(),
- DotFamily::NoClamp}};
+ return {{ROCDL::fdot2_bf16_bf16::getOperationName(), DotFamily::NoClamp}};
return std::nullopt;
}
@@ -1877,8 +1876,7 @@ struct DotOpLowering : public ConvertOpToLLVMPattern<DotOp> {
std::optional<std::pair<StringRef, DotFamily>> maybeIntrinsic =
dotOpToIntrinsic(op, chipset);
if (!maybeIntrinsic)
- return op.emitOpError(
- "no intrinsic matching dot on the given chipset: ")
+ return op.emitOpError("no intrinsic matching dot on the given chipset: ")
<< op.getSourceA().getType() << " * " << op.getSourceB().getType()
<< " + " << op.getDestC().getType();
>From 08860105815d8400fcaaf085ab0992d2b12ccf4e Mon Sep 17 00:00:00 2001
From: Eric Feng <Eric.Feng at amd.com>
Date: Tue, 21 Apr 2026 18:07:50 -0700
Subject: [PATCH 3/7] nits
Signed-off-by: Eric Feng <Eric.Feng at amd.com>
---
mlir/test/Conversion/AMDGPUToROCDL/dot-gfx11.mlir | 3 ---
mlir/test/Conversion/AMDGPUToROCDL/dot-gfx9.mlir | 4 ----
2 files changed, 7 deletions(-)
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/dot-gfx11.mlir b/mlir/test/Conversion/AMDGPUToROCDL/dot-gfx11.mlir
index 51d553748ad6a..d5ba51a8a5501 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/dot-gfx11.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/dot-gfx11.mlir
@@ -1,8 +1,5 @@
// RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx1100 | FileCheck %s
-// Variants first available on gfx11: fdot2 variants with narrower
-// accumulators, fdot2.f32.bf16, and the mixed-sign sudot* ops.
-
// CHECK-LABEL: @dot_fdot2_f16_f16
func.func @dot_fdot2_f16_f16(%a: vector<2xf16>, %b: vector<2xf16>, %c: f16) -> f16 {
// CHECK: rocdl.fdot2.f16.f16 %{{.+}}, %{{.+}}, %{{.+}} : (vector<2xf16>, vector<2xf16>, f16) -> f16
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/dot-gfx9.mlir b/mlir/test/Conversion/AMDGPUToROCDL/dot-gfx9.mlir
index c3b4d459992a7..b6f3b9e11c717 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/dot-gfx9.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/dot-gfx9.mlir
@@ -1,9 +1,5 @@
// RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx906 | FileCheck %s
-// Dot variants available from gfx906. Integer sources arrive unpacked;
-// vector<4xi8> and vector<8xi4> are bitcast to scalar i32 (little-endian
-// lane order) before being passed to ROCDL.
-
// CHECK-LABEL: @dot_fdot2
func.func @dot_fdot2(%a: vector<2xf16>, %b: vector<2xf16>, %c: f32) -> f32 {
// CHECK: rocdl.fdot2 %{{.+}}, %{{.+}}, %{{.+}} : (vector<2xf16>, vector<2xf16>, f32) -> f32
>From b84d8facf8bdf3a5ffa6d6ee206ba6501f280147 Mon Sep 17 00:00:00 2001
From: Eric Feng <Eric.Feng at amd.com>
Date: Tue, 21 Apr 2026 18:09:00 -0700
Subject: [PATCH 4/7] nits
Signed-off-by: Eric Feng <Eric.Feng at amd.com>
---
mlir/test/Conversion/AMDGPUToROCDL/dot-invalid.mlir | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/dot-invalid.mlir b/mlir/test/Conversion/AMDGPUToROCDL/dot-invalid.mlir
index 6265f95b4abd4..02519ac0c6dd1 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/dot-invalid.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/dot-invalid.mlir
@@ -21,8 +21,7 @@ func.func @dot_f16_f16_requires_gfx11(%a: vector<2xf16>, %b: vector<2xf16>, %c:
// -----
-// fdot2.f32.bf16 is available on gfx11+ and gfx950 (ROCDLOps.td:1532);
-// neither gfx906 nor gfx942 supports it.
+// fdot2.f32.bf16 is available on gfx11+ and gfx950+.
func.func @dot_f32_bf16_requires_gfx11_or_gfx950(%a: vector<2xbf16>, %b: vector<2xbf16>, %c: f32) -> f32 {
// expected-error@below {{'amdgpu.dot' op no intrinsic matching dot on the given chipset}}
// expected-error@below {{failed to legalize operation 'amdgpu.dot'}}
>From 4f2707a5feb973cb5b9590ea2b581854bbb5a1d2 Mon Sep 17 00:00:00 2001
From: Eric Feng <Eric.Feng at amd.com>
Date: Tue, 21 Apr 2026 18:12:19 -0700
Subject: [PATCH 5/7] nit
Signed-off-by: Eric Feng <Eric.Feng at amd.com>
---
mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 5455a302d4d49..7f4a58e205bb2 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -683,8 +683,7 @@ static Value packSmallFloatVectorOperand(ConversionPatternRewriter &rewriter,
return input;
}
-/// Converts sparse MFMA/WMMA (smfmac/swmmac) operands to the expected ROCDL
-/// types.
+/// Converts packed vector operands to the expected ROCDL types.
static Value convertPackedVectorOperand(ConversionPatternRewriter &rewriter,
Location loc, Value input,
bool allowBf16 = true) {
>From cb7f71fc4d81a1e87b2549b0bb6aa18ee7698180 Mon Sep 17 00:00:00 2001
From: Eric Feng <Eric.Feng at amd.com>
Date: Tue, 21 Apr 2026 18:13:07 -0700
Subject: [PATCH 6/7] nits
Signed-off-by: Eric Feng <Eric.Feng at amd.com>
---
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
index d7de80e008cf5..1b396e484d807 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
@@ -1111,8 +1111,7 @@ def AMDGPU_DotOp :
The `amdgpu.dot` op is an MLIR wrapper over the `v_dot*` family of intrinsics,
which compute `D = sum_i A[i] * B[i] + C`.
- Variants (source, dest, signedness, chipset -> intrinsic). For the precise
- per-feature chipset enablement, see `hasDot{N}Insts` in `Chipset.h`.
+ Variants (source, dest, signedness, chipset -> intrinsic).
```text
| A elem | B elem | destC | signedness | chipset | ROCDL op |
>From 83eb0691299d76a96472e5fd29fc48e7363717d5 Mon Sep 17 00:00:00 2001
From: Eric Feng <Eric.Feng at amd.com>
Date: Wed, 22 Apr 2026 22:18:27 -0700
Subject: [PATCH 7/7] remove utils test
Signed-off-by: Eric Feng <Eric.Feng at amd.com>
---
.../Dialect/AMDGPU/AMDGPUUtilsTest.cpp | 84 -------------------
1 file changed, 84 deletions(-)
diff --git a/mlir/unittests/Dialect/AMDGPU/AMDGPUUtilsTest.cpp b/mlir/unittests/Dialect/AMDGPU/AMDGPUUtilsTest.cpp
index a70067ca52ce5..570d56f3c6ff1 100644
--- a/mlir/unittests/Dialect/AMDGPU/AMDGPUUtilsTest.cpp
+++ b/mlir/unittests/Dialect/AMDGPU/AMDGPUUtilsTest.cpp
@@ -58,89 +58,5 @@ TEST(ChipsetTest, Comparison) {
EXPECT_FALSE(Chipset(9, 0, 0xa) >= Chipset(9, 4, 2));
}
-TEST(ChipsetTest, HasDot1Insts) {
- // gfx9: enabled from gfx906 onward.
- EXPECT_FALSE(hasDot1Insts(Chipset(9, 0, 0)));
- EXPECT_TRUE(hasDot1Insts(Chipset(9, 0, 6)));
- EXPECT_TRUE(hasDot1Insts(Chipset(9, 0, 8)));
- EXPECT_TRUE(hasDot1Insts(Chipset(9, 0, 0xa)));
- EXPECT_TRUE(hasDot1Insts(Chipset(9, 4, 2)));
- EXPECT_TRUE(hasDot1Insts(Chipset(9, 5, 0)));
-
- // gfx10: only gfx10.1.1, gfx10.1.2, and gfx10.3+ enable Dot1.
- EXPECT_FALSE(hasDot1Insts(Chipset(10, 1, 0))); // gfx1010
- EXPECT_TRUE(hasDot1Insts(Chipset(10, 1, 1))); // gfx1011
- EXPECT_TRUE(hasDot1Insts(Chipset(10, 1, 2))); // gfx1012
- EXPECT_FALSE(hasDot1Insts(Chipset(10, 1, 3))); // gfx1013
- EXPECT_TRUE(hasDot1Insts(Chipset(10, 3, 0))); // gfx1030
-
- // Not on gfx11+/gfx12+/gfx13+.
- EXPECT_FALSE(hasDot1Insts(Chipset(11, 0, 0)));
- EXPECT_FALSE(hasDot1Insts(Chipset(12, 0, 0)));
- EXPECT_FALSE(hasDot1Insts(Chipset(12, 5, 0)));
- EXPECT_FALSE(hasDot1Insts(Chipset(13, 0, 0)));
-}
-
-TEST(ChipsetTest, HasDot7Insts) {
- // Same as Dot1 plus all of gfx11+/gfx12+/gfx13+.
- EXPECT_FALSE(hasDot7Insts(Chipset(9, 0, 0)));
- EXPECT_TRUE(hasDot7Insts(Chipset(9, 0, 6)));
- EXPECT_FALSE(hasDot7Insts(Chipset(10, 1, 0)));
- EXPECT_TRUE(hasDot7Insts(Chipset(11, 0, 0)));
- EXPECT_TRUE(hasDot7Insts(Chipset(12, 0, 0)));
- EXPECT_TRUE(hasDot7Insts(Chipset(12, 5, 0))); // gfx1250 still has Dot7.
- EXPECT_TRUE(hasDot7Insts(Chipset(13, 0, 0)));
-}
-
-TEST(ChipsetTest, HasDot8Insts) {
- // gfx11+ only.
- EXPECT_FALSE(hasDot8Insts(Chipset(9, 4, 2)));
- EXPECT_FALSE(hasDot8Insts(Chipset(10, 3, 0)));
- EXPECT_TRUE(hasDot8Insts(Chipset(11, 0, 0)));
- EXPECT_TRUE(hasDot8Insts(Chipset(12, 5, 0))); // gfx1250 has Dot8.
-}
-
-TEST(ChipsetTest, HasDot9Insts) {
- // gfx11.x and gfx12.0 only.
- EXPECT_FALSE(hasDot9Insts(Chipset(9, 4, 2)));
- EXPECT_FALSE(hasDot9Insts(Chipset(10, 3, 0)));
- EXPECT_TRUE(hasDot9Insts(Chipset(11, 0, 0)));
- EXPECT_TRUE(hasDot9Insts(Chipset(11, 7, 0)));
- EXPECT_TRUE(hasDot9Insts(Chipset(12, 0, 0)));
- EXPECT_FALSE(hasDot9Insts(Chipset(12, 5, 0))); // gfx1250 lacks Dot9.
- EXPECT_FALSE(hasDot9Insts(Chipset(13, 0, 0)));
-}
-
-TEST(ChipsetTest, HasDot10Insts) {
- // Dot1's set plus gfx11.x and gfx12.0 (excludes gfx12.5+/gfx13+).
- EXPECT_TRUE(hasDot10Insts(Chipset(9, 0, 6)));
- EXPECT_FALSE(hasDot10Insts(Chipset(10, 1, 0)));
- EXPECT_TRUE(hasDot10Insts(Chipset(10, 3, 0)));
- EXPECT_TRUE(hasDot10Insts(Chipset(11, 0, 0)));
- EXPECT_TRUE(hasDot10Insts(Chipset(12, 0, 0)));
- EXPECT_FALSE(hasDot10Insts(Chipset(12, 5, 0)));
- EXPECT_FALSE(hasDot10Insts(Chipset(13, 0, 0)));
-}
-
-TEST(ChipsetTest, HasDot11Insts) {
- // Only gfx11.7 and gfx12.0.
- EXPECT_FALSE(hasDot11Insts(Chipset(9, 5, 0)));
- EXPECT_FALSE(hasDot11Insts(Chipset(11, 0, 0)));
- EXPECT_FALSE(hasDot11Insts(Chipset(11, 5, 0)));
- EXPECT_TRUE(hasDot11Insts(Chipset(11, 7, 0)));
- EXPECT_TRUE(hasDot11Insts(Chipset(12, 0, 0)));
- EXPECT_FALSE(hasDot11Insts(Chipset(12, 5, 0)));
-}
-
-TEST(ChipsetTest, HasDot12Insts) {
- // gfx9.5.0, gfx11.x, and gfx12.0.
- EXPECT_FALSE(hasDot12Insts(Chipset(9, 0, 6)));
- EXPECT_FALSE(hasDot12Insts(Chipset(9, 4, 2)));
- EXPECT_TRUE(hasDot12Insts(Chipset(9, 5, 0)));
- EXPECT_TRUE(hasDot12Insts(Chipset(11, 0, 0)));
- EXPECT_TRUE(hasDot12Insts(Chipset(12, 0, 0)));
- EXPECT_FALSE(hasDot12Insts(Chipset(12, 5, 0)));
-}
-
} // namespace
} // namespace mlir::amdgpu
More information about the Mlir-commits
mailing list