[Mlir-commits] [mlir] [AMDGPU] Implement amdgpu.dot op (PR #193371)

Wed Apr 22 22:23:31 PDT 2026

https://github.com/efric updated https://github.com/llvm/llvm-project/pull/193371

>From 2f07948ae1c912f123cac06e4a2d5ac473579a14 Mon Sep 17 00:00:00 2001
From: Eric Feng <Eric.Feng at amd.com>
Date: Tue, 21 Apr 2026 18:01:48 -0700
Subject: [PATCH 1/7] amdgpu dot ops

Signed-off-by: Eric Feng <Eric.Feng at amd.com>
---
 .../mlir/Dialect/AMDGPU/IR/AMDGPUOps.td       |  58 ++++++
 .../mlir/Dialect/AMDGPU/Utils/Chipset.h       |  58 +++++-
 .../AMDGPUToROCDL/AMDGPUToROCDL.cpp           | 173 +++++++++++++++++-
 mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp      |  50 +++++
 .../Conversion/AMDGPUToROCDL/dot-gfx11.mlir   |  62 +++++++
 .../Conversion/AMDGPUToROCDL/dot-gfx12.mlir   |  31 ++++
 .../Conversion/AMDGPUToROCDL/dot-gfx9.mlir    |  65 +++++++
 .../Conversion/AMDGPUToROCDL/dot-invalid.mlir |  41 +++++
 mlir/test/Dialect/AMDGPU/invalid.mlir         |  99 ++++++++++
 mlir/test/Dialect/AMDGPU/ops.mlir             |  74 ++++++++
 .../Dialect/AMDGPU/AMDGPUUtilsTest.cpp        |  84 +++++++++
 11 files changed, 786 insertions(+), 9 deletions(-)
 create mode 100644 mlir/test/Conversion/AMDGPUToROCDL/dot-gfx11.mlir
 create mode 100644 mlir/test/Conversion/AMDGPUToROCDL/dot-gfx12.mlir
 create mode 100644 mlir/test/Conversion/AMDGPUToROCDL/dot-gfx9.mlir
 create mode 100644 mlir/test/Conversion/AMDGPUToROCDL/dot-invalid.mlir

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
index f8d4a3fdadf6b..1afc941332131 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
@@ -1091,6 +1091,64 @@ def AMDGPU_WMMAOp :
   let hasVerifier = 1;
 }
 
+def DotInTypes : AnyTypeOf<[VectorOfLengthAndType<[2], [F16, BF16, I16]>,
+                            VectorOfLengthAndType<[4], [I8, F8E4M3FN, F8E5M2]>,
+                            VectorOfLengthAndType<[8], [I<4>]>]>;
+
+def DotOutTypes : AnyTypeOf<[F32, F16, BF16, I32]>;
+
+def AMDGPU_DotOp :
+    AMDGPU_Op<"dot", [AllTypesMatch<["destC", "destD"]>, Pure]>,
+    Arguments<(ins DotInTypes:$sourceA,
+                   DotInTypes:$sourceB,
+                   DotOutTypes:$destC,
+                   UnitAttr:$unsignedA,
+                   UnitAttr:$unsignedB,
+                   UnitAttr:$clamp)>,
+    Results<(outs DotOutTypes:$destD)> {
+  let summary = "MLIR wrapper for AMDGPU v_dot* intrinsics";
+  let description = [{
+    The `amdgpu.dot` op is an MLIR wrapper over the `v_dot*` family of intrinsics,
+    which compute `D = sum_i A[i] * B[i] + C`.
+
+    Variants (source, dest, signedness, chipset -> intrinsic). For the precise
+    per-feature chipset enablement, see `hasDot{N}Insts` in `Chipset.h`.
+
+    ```text
+    | A elem   | B elem   | destC | signedness | chipset                   | ROCDL op                     |
+    |----------|----------|-------|------------|---------------------------|------------------------------|
+    | f16      | f16      | f32   | n/a        | gfx906+                   | fdot2                        |
+    | f16      | f16      | f16   | n/a        | gfx11+                    | fdot2.f16.f16                |
+    | bf16     | bf16     | f32   | n/a        | gfx11+, gfx950            | fdot2.f32.bf16               |
+    | bf16     | bf16     | bf16  | n/a        | gfx11+                    | fdot2.bf16.bf16              |
+    | i16      | i16      | i32   | s / u      | gfx906+, not gfx11+       | sdot2 / udot2                |
+    | i8       | i8       | i32   | s / u      | gfx906+                   | sdot4 / udot4                |
+    | i8       | i8       | i32   | mixed      | gfx11+                    | sudot4                       |
+    | i4       | i4       | i32   | s / u      | gfx906+                   | sdot8 / udot8                |
+    | i4       | i4       | i32   | mixed      | gfx11+                    | sudot8                       |
+    | fp8/bf8  | fp8/bf8  | f32   | n/a        | gfx11.7, gfx12+           | dot4.f32.{fp8,bf8}.{fp8,bf8} |
+    ```
+
+    Example:
+    ```mlir
+    %r0 = amdgpu.dot %a * %b + %c : vector<4xi8>, vector<4xi8>, i32
+    %r1 = amdgpu.dot %a * %b + %c {unsignedA, unsignedB, clamp}
+        : vector<8xi4>, vector<8xi4>, i32
+    %r2 = amdgpu.dot %a * %b + %c {unsignedB}
+        : vector<4xi8>, vector<4xi8>, i32
+    %r3 = amdgpu.dot %a * %b + %c : vector<2xf16>, vector<2xf16>, f32
+    %r4 = amdgpu.dot %a * %b + %c : vector<2xf16>, vector<2xf16>, f16
+    %r5 = amdgpu.dot %a * %b + %c
+        : vector<4xf8E4M3FN>, vector<4xf8E5M2>, f32
+    ```
+  }];
+  let assemblyFormat = [{
+    $sourceA `*` $sourceB `+` $destC attr-dict
+      `:` type($sourceA) `,` type($sourceB) `,` type($destC)
+  }];
+  let hasVerifier = 1;
+}
+
 def AMDGPU_SparseMFMAOp :
     AMDGPU_Op<"sparse_mfma", [AllTypesMatch<["destC", "destD"]>,
                               Pure]>,
diff --git a/mlir/include/mlir/Dialect/AMDGPU/Utils/Chipset.h b/mlir/include/mlir/Dialect/AMDGPU/Utils/Chipset.h
index ca9809799588c..3dab1eba9b526 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/Utils/Chipset.h
+++ b/mlir/include/mlir/Dialect/AMDGPU/Utils/Chipset.h
@@ -15,10 +15,10 @@ namespace mlir::amdgpu {
 
 /// Represents the amdgpu gfx chipset version, e.g., gfx90a, gfx942, gfx1103.
 /// Note that the leading digits form a decimal number, while the last two
-/// digits for a hexadecimal number. For example:
+/// digits form a hexadecimal number. For example:
 ///   gfx942  --> major = 9, minor = 0x4, stepping = 0x2
 ///   gfx90a  --> major = 9, minor = 0x0, stepping = 0xa
-///   gfx1103 --> major = 10, minor = 0x0, stepping = 0x3
+///   gfx1103 --> major = 11, minor = 0x0, stepping = 0x3
 struct Chipset {
   unsigned majorVersion = 0;    // The major version (decimal).
   unsigned minorVersion = 0;    // The minor version (hexadecimal).
@@ -54,6 +54,60 @@ inline bool hasOcpFp8(const Chipset &chipset) {
          chipset.majorVersion >= 12;
 }
 
+// Predicates mirroring the LLVM AMDGPU `HasDot{N}Insts` features that gate
+// the `v_dot*` instructions consumed by the `amdgpu.dot` lowering.
+
+inline bool hasDot1Insts(const Chipset &chipset) {
+  if (chipset.majorVersion == 9)
+    return chipset >= Chipset(9, 0, 6);
+  if (chipset.majorVersion == 10) {
+    if (chipset.minorVersion == 1)
+      return chipset.steppingVersion == 1u || chipset.steppingVersion == 2u;
+    return chipset.minorVersion >= 3u;
+  }
+  return false;
+}
+
+inline bool hasDot2Insts(const Chipset &chipset) {
+  return hasDot1Insts(chipset);
+}
+
+inline bool hasDot7Insts(const Chipset &chipset) {
+  return chipset.majorVersion >= 11 || hasDot1Insts(chipset);
+}
+
+inline bool hasDot8Insts(const Chipset &chipset) {
+  return chipset.majorVersion >= 11;
+}
+
+inline bool hasDot9Insts(const Chipset &chipset) {
+  if (chipset.majorVersion == 11)
+    return true;
+  return chipset.majorVersion == 12 && chipset.minorVersion == 0;
+}
+
+inline bool hasDot10Insts(const Chipset &chipset) {
+  if (chipset.majorVersion == 11)
+    return true;
+  if (chipset.majorVersion == 12)
+    return chipset.minorVersion == 0;
+  return hasDot1Insts(chipset);
+}
+
+inline bool hasDot11Insts(const Chipset &chipset) {
+  if (chipset.majorVersion == 11)
+    return chipset.minorVersion == 7u;
+  return chipset.majorVersion == 12 && chipset.minorVersion == 0;
+}
+
+inline bool hasDot12Insts(const Chipset &chipset) {
+  if (chipset == Chipset(9, 5, 0))
+    return true;
+  if (chipset.majorVersion == 11)
+    return true;
+  return chipset.majorVersion == 12 && chipset.minorVersion == 0;
+}
+
 } // namespace mlir::amdgpu
 
 #endif
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 14d99c250c0b6..a161d0da29aee 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -685,7 +685,7 @@ static Value packSmallFloatVectorOperand(ConversionPatternRewriter &rewriter,
 
 /// Converts sparse MFMA/WMMA (smfmac/swmmac) operands to the expected ROCDL
 /// types.
-static Value convertSparseVectorOperand(ConversionPatternRewriter &rewriter,
+static Value convertPackedVectorOperand(ConversionPatternRewriter &rewriter,
                                         Location loc, Value input,
                                         bool allowBf16 = true) {
   Type inputType = input.getType();
@@ -1646,9 +1646,9 @@ struct SparseMFMAOpLowering : public ConvertOpToLLVMPattern<SparseMFMAOp> {
       return op->emitOpError("sparse MFMA (smfmac) only supported on gfx942+");
     bool isGfx950 = chipset >= kGfx950;
 
-    Value a = convertSparseVectorOperand(rewriter, loc, adaptor.getSourceA(),
+    Value a = convertPackedVectorOperand(rewriter, loc, adaptor.getSourceA(),
                                          isGfx950);
-    Value b = convertSparseVectorOperand(rewriter, loc, adaptor.getSourceB(),
+    Value b = convertPackedVectorOperand(rewriter, loc, adaptor.getSourceB(),
                                          isGfx950);
     Value c = adaptor.getDestC();
 
@@ -1753,6 +1753,165 @@ struct WMMAOpLowering : public ConvertOpToLLVMPattern<WMMAOp> {
   }
 };
 
+enum class DotFamily {
+  /// ROCDL_Dot_IntrOp: single `clamp` attribute.
+  Clamp,
+  /// ROCDL_Dot_NoClamp_IntrOp: no attributes.
+  NoClamp,
+  /// ROCDL_Sudot_IntrOp: `signA`, `signB`, and `clamp` attributes.
+  Sudot,
+};
+
+static std::optional<std::pair<StringRef, DotFamily>>
+dotOpToIntrinsic(DotOp op, Chipset chipset) {
+  Type aElem = cast<VectorType>(op.getSourceA().getType()).getElementType();
+  Type bElem = cast<VectorType>(op.getSourceB().getType()).getElementType();
+  Type dest = op.getDestC().getType();
+  bool uA = op.getUnsignedA();
+  bool uB = op.getUnsignedB();
+
+  // f16 x f16 -> f32 / f16.
+  if (aElem.isF16() && bElem.isF16()) {
+    if (dest.isF32() && hasDot10Insts(chipset))
+      return {{ROCDL::fdot2::getOperationName(), DotFamily::Clamp}};
+    if (dest.isF16() && hasDot9Insts(chipset))
+      return {{ROCDL::fdot2_f16_f16::getOperationName(), DotFamily::NoClamp}};
+    return std::nullopt;
+  }
+
+  // bf16 x bf16 -> f32 / bf16.
+  if (aElem.isBF16() && bElem.isBF16()) {
+    if (dest.isF32() && hasDot12Insts(chipset))
+      return {{ROCDL::fdot2_f32_bf16::getOperationName(), DotFamily::Clamp}};
+    if (dest.isBF16() && hasDot9Insts(chipset))
+      return {{ROCDL::fdot2_bf16_bf16::getOperationName(),
+               DotFamily::NoClamp}};
+    return std::nullopt;
+  }
+
+  // Integer sources -> i32.
+  if (isa<IntegerType>(aElem) && isa<IntegerType>(bElem) &&
+      dest.isInteger(32)) {
+    bool mixedSign = (uA != uB);
+    unsigned elemWidth = aElem.getIntOrFloatBitWidth();
+
+    if (mixedSign) {
+      if (!hasDot8Insts(chipset))
+        return std::nullopt;
+      StringRef name;
+      switch (elemWidth) {
+      case 8:
+        name = ROCDL::sudot4::getOperationName();
+        break;
+      case 4:
+        name = ROCDL::sudot8::getOperationName();
+        break;
+      default:
+        return std::nullopt;
+      }
+      return {{name, DotFamily::Sudot}};
+    }
+
+    StringRef name;
+    bool supported = false;
+    switch (elemWidth) {
+    case 16:
+      supported = hasDot2Insts(chipset);
+      name = uA ? ROCDL::udot2::getOperationName()
+                : ROCDL::sdot2::getOperationName();
+      break;
+    case 8:
+      supported = uA ? hasDot7Insts(chipset)
+                     : hasDot1Insts(chipset) || hasDot8Insts(chipset);
+      name = uA ? ROCDL::udot4::getOperationName()
+                : ROCDL::sdot4::getOperationName();
+      break;
+    case 4:
+      supported = uA ? hasDot7Insts(chipset)
+                     : hasDot1Insts(chipset) || hasDot8Insts(chipset);
+      name = uA ? ROCDL::udot8::getOperationName()
+                : ROCDL::sdot8::getOperationName();
+      break;
+    default:
+      return std::nullopt;
+    }
+    if (!supported)
+      return std::nullopt;
+    return {{name, DotFamily::Clamp}};
+  }
+
+  // fp8/bf8 x fp8/bf8 -> f32.
+  bool aIsFp8 = isa<Float8E4M3FNType>(aElem);
+  bool aIsBf8 = isa<Float8E5M2Type>(aElem);
+  bool bIsFp8 = isa<Float8E4M3FNType>(bElem);
+  bool bIsBf8 = isa<Float8E5M2Type>(bElem);
+  if ((aIsFp8 || aIsBf8) && (bIsFp8 || bIsBf8) && dest.isF32()) {
+    if (!hasDot11Insts(chipset))
+      return std::nullopt;
+    StringRef name;
+    if (aIsFp8 && bIsFp8)
+      name = ROCDL::dot4_f32_fp8_fp8::getOperationName();
+    else if (aIsFp8 && bIsBf8)
+      name = ROCDL::dot4_f32_fp8_bf8::getOperationName();
+    else if (aIsBf8 && bIsFp8)
+      name = ROCDL::dot4_f32_bf8_fp8::getOperationName();
+    else
+      name = ROCDL::dot4_f32_bf8_bf8::getOperationName();
+    return {{name, DotFamily::NoClamp}};
+  }
+
+  return std::nullopt;
+}
+
+struct DotOpLowering : public ConvertOpToLLVMPattern<DotOp> {
+  DotOpLowering(const LLVMTypeConverter &converter, Chipset chipset)
+      : ConvertOpToLLVMPattern<DotOp>(converter), chipset(chipset) {}
+
+  Chipset chipset;
+
+  LogicalResult
+  matchAndRewrite(DotOp op, DotOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    Location loc = op.getLoc();
+
+    std::optional<std::pair<StringRef, DotFamily>> maybeIntrinsic =
+        dotOpToIntrinsic(op, chipset);
+    if (!maybeIntrinsic)
+      return op.emitOpError(
+                 "no intrinsic matching dot on the given chipset: ")
+             << op.getSourceA().getType() << " * " << op.getSourceB().getType()
+             << " + " << op.getDestC().getType();
+
+    auto [intrinsicName, family] = maybeIntrinsic.value();
+
+    Value a = convertPackedVectorOperand(rewriter, loc, adaptor.getSourceA());
+    Value b = convertPackedVectorOperand(rewriter, loc, adaptor.getSourceB());
+    Value c = adaptor.getDestC();
+
+    SmallVector<NamedAttribute, 3> attrs;
+    if (family == DotFamily::Sudot) {
+      attrs.push_back(rewriter.getNamedAttr(
+          "signA", rewriter.getBoolAttr(!op.getUnsignedA())));
+      attrs.push_back(rewriter.getNamedAttr(
+          "signB", rewriter.getBoolAttr(!op.getUnsignedB())));
+    }
+
+    if (family != DotFamily::NoClamp && op.getClamp())
+      attrs.push_back(
+          rewriter.getNamedAttr("clamp", rewriter.getBoolAttr(true)));
+
+    Type resultType = typeConverter->convertType(op.getDestD().getType());
+
+    OperationState loweredOp(loc, intrinsicName);
+    loweredOp.addTypes(resultType);
+    loweredOp.addOperands({a, b, c});
+    loweredOp.addAttributes(attrs);
+    Operation *lowered = rewriter.create(loweredOp);
+    rewriter.replaceOp(op, lowered->getResults());
+    return success();
+  }
+};
+
 struct SparseWMMAOpLowering : public ConvertOpToLLVMPattern<SparseWMMAOp> {
   SparseWMMAOpLowering(const LLVMTypeConverter &converter, Chipset chipset)
       : ConvertOpToLLVMPattern<SparseWMMAOp>(converter), chipset(chipset) {}
@@ -1803,14 +1962,14 @@ struct SparseWMMAOpLowering : public ConvertOpToLLVMPattern<SparseWMMAOp> {
 
     const bool isGFX1250orHigher =
         chipset.majorVersion == 12 && chipset.minorVersion >= 5;
-    Value a = convertSparseVectorOperand(rewriter, loc, adaptor.getSourceA(),
+    Value a = convertPackedVectorOperand(rewriter, loc, adaptor.getSourceA(),
                                          isGFX1250orHigher);
-    Value b = convertSparseVectorOperand(rewriter, loc, adaptor.getSourceB(),
+    Value b = convertPackedVectorOperand(rewriter, loc, adaptor.getSourceB(),
                                          isGFX1250orHigher);
     Value c = adaptor.getDestC();
     VectorType rawOutType = outType;
     if (!isGFX1250orHigher) {
-      c = convertSparseVectorOperand(rewriter, loc, adaptor.getDestC(), false);
+      c = convertPackedVectorOperand(rewriter, loc, adaptor.getDestC(), false);
       rawOutType = cast<VectorType>(c.getType());
     }
 
@@ -4191,7 +4350,7 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
            AMDGPUDPPLowering, MemoryCounterWaitOpLowering, LDSBarrierOpLowering,
            SchedBarrierOpLowering, MFMAOpLowering, ScaledMFMAOpLowering,
            SparseMFMAOpLowering, WMMAOpLowering, ScaledWMMAOpLowering,
-           SparseWMMAOpLowering, ExtPackedFp8OpLowering,
+           SparseWMMAOpLowering, DotOpLowering, ExtPackedFp8OpLowering,
            ScaledExtPackedMatrixOpLowering, ScaledExtPackedOpLowering,
            PackedScaledTruncOpLowering, PackedTrunc2xFp8OpLowering,
            PackedStochRoundFp8OpLowering, GatherToLDSOpLowering,
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
index d4811275b6fd6..f19b0f3f9edf5 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
@@ -741,6 +741,56 @@ LogicalResult SparseWMMAOp::verify() {
   return success();
 }
 
+//===----------------------------------------------------------------------===//
+// DotOp
+//===----------------------------------------------------------------------===//
+LogicalResult DotOp::verify() {
+  Type aElem = cast<VectorType>(getSourceA().getType()).getElementType();
+  Type bElem = cast<VectorType>(getSourceB().getType()).getElementType();
+  Type dest = getDestC().getType();
+
+  bool aIsFloat8 = aElem.isFloat(8);
+  bool bIsFloat8 = bElem.isFloat(8);
+  bool aIsInteger = isa<IntegerType>(aElem);
+
+  bool bothFloat8 = aIsFloat8 && bIsFloat8;
+  if (!bothFloat8 && aElem != bElem)
+    return emitOpError(
+        "expected source operands to have the same element type");
+
+  if (aElem.isF16()) {
+    if (!dest.isF32() && !dest.isF16())
+      return emitOpError("expected f32 or f16 accumulator for f16 sources");
+  } else if (aElem.isBF16()) {
+    if (!dest.isF32() && !dest.isBF16())
+      return emitOpError("expected f32 or bf16 accumulator for bf16 sources");
+  } else if (aIsInteger) {
+    if (!dest.isInteger(32))
+      return emitOpError("expected i32 accumulator for integer sources");
+  } else if (aIsFloat8) {
+    if (!dest.isF32())
+      return emitOpError("expected f32 accumulator for fp8 sources");
+  }
+
+  if ((getUnsignedA() || getUnsignedB()) && !aIsInteger)
+    return emitOpError(
+        "unsignedA/unsignedB are only valid for integer source types");
+
+  if (aElem.isInteger(16) && getUnsignedA() != getUnsignedB())
+    return emitOpError(
+        "mixed-sign dot is not supported for 16-bit integer sources");
+
+  if (getClamp()) {
+    bool noClamp = (aElem.isF16() && dest.isF16()) ||
+                   (aElem.isBF16() && dest.isBF16()) || aIsFloat8;
+    if (noClamp)
+      return emitOpError(
+          "clamp is not supported for this (source, accumulator) combination");
+  }
+
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // DPPOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/dot-gfx11.mlir b/mlir/test/Conversion/AMDGPUToROCDL/dot-gfx11.mlir
new file mode 100644
index 0000000000000..51d553748ad6a
--- /dev/null
+++ b/mlir/test/Conversion/AMDGPUToROCDL/dot-gfx11.mlir
@@ -0,0 +1,62 @@
+// RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx1100 | FileCheck %s
+
+// Variants first available on gfx11: fdot2 variants with narrower
+// accumulators, fdot2.f32.bf16, and the mixed-sign sudot* ops.
+
+// CHECK-LABEL: @dot_fdot2_f16_f16
+func.func @dot_fdot2_f16_f16(%a: vector<2xf16>, %b: vector<2xf16>, %c: f16) -> f16 {
+  // CHECK: rocdl.fdot2.f16.f16 %{{.+}}, %{{.+}}, %{{.+}} : (vector<2xf16>, vector<2xf16>, f16) -> f16
+  %r = amdgpu.dot %a * %b + %c : vector<2xf16>, vector<2xf16>, f16
+  func.return %r : f16
+}
+
+// CHECK-LABEL: @dot_fdot2_bf16_bf16
+func.func @dot_fdot2_bf16_bf16(%a: vector<2xbf16>, %b: vector<2xbf16>, %c: bf16) -> bf16 {
+  // CHECK: rocdl.fdot2.bf16.bf16 %{{.+}}, %{{.+}}, %{{.+}} : (vector<2xbf16>, vector<2xbf16>, bf16) -> bf16
+  %r = amdgpu.dot %a * %b + %c : vector<2xbf16>, vector<2xbf16>, bf16
+  func.return %r : bf16
+}
+
+// CHECK-LABEL: @dot_fdot2_f32_bf16
+func.func @dot_fdot2_f32_bf16(%a: vector<2xbf16>, %b: vector<2xbf16>, %c: f32) -> f32 {
+  // CHECK: rocdl.fdot2.f32.bf16 %{{.+}}, %{{.+}}, %{{.+}} : (vector<2xbf16>, vector<2xbf16>, f32) -> f32
+  %r = amdgpu.dot %a * %b + %c : vector<2xbf16>, vector<2xbf16>, f32
+  func.return %r : f32
+}
+
+// Uniform-sign sdot4 still dispatches to the dedicated rocdl.sdot4 (not
+// sudot4) on gfx11+. The backend aliases v_dot4_i32_i8 to v_dot4_i32_iu8
+// at llvm/lib/Target/AMDGPU/VOP3PInstructions.td:2647, so this produces
+// identical machine code to the gfx9 lowering.
+
+// CHECK-LABEL: @dot_sdot4_gfx11_uniform_sign
+func.func @dot_sdot4_gfx11_uniform_sign(%a: vector<4xi8>, %b: vector<4xi8>, %c: i32) -> i32 {
+  // CHECK: rocdl.sdot4 %{{.+}}, %{{.+}}, %{{.+}} : (i32, i32, i32) -> i32
+  %r = amdgpu.dot %a * %b + %c : vector<4xi8>, vector<4xi8>, i32
+  func.return %r : i32
+}
+
+// Mixed-sign i8 dot → rocdl.sudot4.
+
+// CHECK-LABEL: @dot_sudot4_signA_unsignedB
+func.func @dot_sudot4_signA_unsignedB(%a: vector<4xi8>, %b: vector<4xi8>, %c: i32) -> i32 {
+  // CHECK: rocdl.sudot4 %{{.+}}, %{{.+}}, %{{.+}} {signA = true} : (i32, i32, i32) -> i32
+  %r = amdgpu.dot %a * %b + %c {unsignedB} : vector<4xi8>, vector<4xi8>, i32
+  func.return %r : i32
+}
+
+// CHECK-LABEL: @dot_sudot4_unsignedA_signB_clamp
+func.func @dot_sudot4_unsignedA_signB_clamp(%a: vector<4xi8>, %b: vector<4xi8>, %c: i32) -> i32 {
+  // CHECK: rocdl.sudot4 %{{.+}}, %{{.+}}, %{{.+}} {clamp = true, signB = true} : (i32, i32, i32) -> i32
+  %r = amdgpu.dot %a * %b + %c {unsignedA, clamp} : vector<4xi8>, vector<4xi8>, i32
+  func.return %r : i32
+}
+
+// Mixed-sign i4 dot → rocdl.sudot8.
+
+// CHECK-LABEL: @dot_sudot8
+func.func @dot_sudot8(%a: vector<8xi4>, %b: vector<8xi4>, %c: i32) -> i32 {
+  // CHECK: rocdl.sudot8 %{{.+}}, %{{.+}}, %{{.+}} {signA = true} : (i32, i32, i32) -> i32
+  %r = amdgpu.dot %a * %b + %c {unsignedB} : vector<8xi4>, vector<8xi4>, i32
+  func.return %r : i32
+}
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/dot-gfx12.mlir b/mlir/test/Conversion/AMDGPUToROCDL/dot-gfx12.mlir
new file mode 100644
index 0000000000000..3213b5fa8f5c2
--- /dev/null
+++ b/mlir/test/Conversion/AMDGPUToROCDL/dot-gfx12.mlir
@@ -0,0 +1,31 @@
+// RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx1200 | FileCheck %s
+
+// CHECK-LABEL: @dot_fp8_fp8
+func.func @dot_fp8_fp8(%a: vector<4xf8E4M3FN>, %b: vector<4xf8E4M3FN>, %c: f32) -> f32 {
+  // CHECK: %[[A:.+]] = llvm.bitcast %{{.+}} : vector<4xi8> to i32
+  // CHECK: %[[B:.+]] = llvm.bitcast %{{.+}} : vector<4xi8> to i32
+  // CHECK: rocdl.dot4.f32.fp8.fp8 %[[A]], %[[B]], %{{.+}} : (i32, i32, f32) -> f32
+  %r = amdgpu.dot %a * %b + %c : vector<4xf8E4M3FN>, vector<4xf8E4M3FN>, f32
+  func.return %r : f32
+}
+
+// CHECK-LABEL: @dot_fp8_bf8
+func.func @dot_fp8_bf8(%a: vector<4xf8E4M3FN>, %b: vector<4xf8E5M2>, %c: f32) -> f32 {
+  // CHECK: rocdl.dot4.f32.fp8.bf8 %{{.+}}, %{{.+}}, %{{.+}} : (i32, i32, f32) -> f32
+  %r = amdgpu.dot %a * %b + %c : vector<4xf8E4M3FN>, vector<4xf8E5M2>, f32
+  func.return %r : f32
+}
+
+// CHECK-LABEL: @dot_bf8_fp8
+func.func @dot_bf8_fp8(%a: vector<4xf8E5M2>, %b: vector<4xf8E4M3FN>, %c: f32) -> f32 {
+  // CHECK: rocdl.dot4.f32.bf8.fp8 %{{.+}}, %{{.+}}, %{{.+}} : (i32, i32, f32) -> f32
+  %r = amdgpu.dot %a * %b + %c : vector<4xf8E5M2>, vector<4xf8E4M3FN>, f32
+  func.return %r : f32
+}
+
+// CHECK-LABEL: @dot_bf8_bf8
+func.func @dot_bf8_bf8(%a: vector<4xf8E5M2>, %b: vector<4xf8E5M2>, %c: f32) -> f32 {
+  // CHECK: rocdl.dot4.f32.bf8.bf8 %{{.+}}, %{{.+}}, %{{.+}} : (i32, i32, f32) -> f32
+  %r = amdgpu.dot %a * %b + %c : vector<4xf8E5M2>, vector<4xf8E5M2>, f32
+  func.return %r : f32
+}
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/dot-gfx9.mlir b/mlir/test/Conversion/AMDGPUToROCDL/dot-gfx9.mlir
new file mode 100644
index 0000000000000..c3b4d459992a7
--- /dev/null
+++ b/mlir/test/Conversion/AMDGPUToROCDL/dot-gfx9.mlir
@@ -0,0 +1,65 @@
+// RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx906 | FileCheck %s
+
+// Dot variants available from gfx906. Integer sources arrive unpacked;
+// vector<4xi8> and vector<8xi4> are bitcast to scalar i32 (little-endian
+// lane order) before being passed to ROCDL.
+
+// CHECK-LABEL: @dot_fdot2
+func.func @dot_fdot2(%a: vector<2xf16>, %b: vector<2xf16>, %c: f32) -> f32 {
+  // CHECK: rocdl.fdot2 %{{.+}}, %{{.+}}, %{{.+}} : (vector<2xf16>, vector<2xf16>, f32) -> f32
+  %r = amdgpu.dot %a * %b + %c : vector<2xf16>, vector<2xf16>, f32
+  func.return %r : f32
+}
+
+// CHECK-LABEL: @dot_fdot2_clamp
+func.func @dot_fdot2_clamp(%a: vector<2xf16>, %b: vector<2xf16>, %c: f32) -> f32 {
+  // CHECK: rocdl.fdot2 %{{.+}}, %{{.+}}, %{{.+}} {clamp = true} : (vector<2xf16>, vector<2xf16>, f32) -> f32
+  %r = amdgpu.dot %a * %b + %c {clamp} : vector<2xf16>, vector<2xf16>, f32
+  func.return %r : f32
+}
+
+// CHECK-LABEL: @dot_sdot2
+func.func @dot_sdot2(%a: vector<2xi16>, %b: vector<2xi16>, %c: i32) -> i32 {
+  // CHECK: rocdl.sdot2 %{{.+}}, %{{.+}}, %{{.+}} : (vector<2xi16>, vector<2xi16>, i32) -> i32
+  %r = amdgpu.dot %a * %b + %c : vector<2xi16>, vector<2xi16>, i32
+  func.return %r : i32
+}
+
+// CHECK-LABEL: @dot_udot2_clamp
+func.func @dot_udot2_clamp(%a: vector<2xi16>, %b: vector<2xi16>, %c: i32) -> i32 {
+  // CHECK: rocdl.udot2 %{{.+}}, %{{.+}}, %{{.+}} {clamp = true} : (vector<2xi16>, vector<2xi16>, i32) -> i32
+  %r = amdgpu.dot %a * %b + %c {unsignedA, unsignedB, clamp} : vector<2xi16>, vector<2xi16>, i32
+  func.return %r : i32
+}
+
+// CHECK-LABEL: @dot_sdot4
+func.func @dot_sdot4(%a: vector<4xi8>, %b: vector<4xi8>, %c: i32) -> i32 {
+  // CHECK: %[[A:.+]] = llvm.bitcast %{{.+}} : vector<4xi8> to i32
+  // CHECK: %[[B:.+]] = llvm.bitcast %{{.+}} : vector<4xi8> to i32
+  // CHECK: rocdl.sdot4 %[[A]], %[[B]], %{{.+}} : (i32, i32, i32) -> i32
+  %r = amdgpu.dot %a * %b + %c : vector<4xi8>, vector<4xi8>, i32
+  func.return %r : i32
+}
+
+// CHECK-LABEL: @dot_udot4_clamp
+func.func @dot_udot4_clamp(%a: vector<4xi8>, %b: vector<4xi8>, %c: i32) -> i32 {
+  // CHECK: rocdl.udot4 %{{.+}}, %{{.+}}, %{{.+}} {clamp = true} : (i32, i32, i32) -> i32
+  %r = amdgpu.dot %a * %b + %c {unsignedA, unsignedB, clamp} : vector<4xi8>, vector<4xi8>, i32
+  func.return %r : i32
+}
+
+// CHECK-LABEL: @dot_sdot8
+func.func @dot_sdot8(%a: vector<8xi4>, %b: vector<8xi4>, %c: i32) -> i32 {
+  // CHECK: %[[A:.+]] = llvm.bitcast %{{.+}} : vector<8xi4> to i32
+  // CHECK: %[[B:.+]] = llvm.bitcast %{{.+}} : vector<8xi4> to i32
+  // CHECK: rocdl.sdot8 %[[A]], %[[B]], %{{.+}} : (i32, i32, i32) -> i32
+  %r = amdgpu.dot %a * %b + %c : vector<8xi4>, vector<8xi4>, i32
+  func.return %r : i32
+}
+
+// CHECK-LABEL: @dot_udot8
+func.func @dot_udot8(%a: vector<8xi4>, %b: vector<8xi4>, %c: i32) -> i32 {
+  // CHECK: rocdl.udot8 %{{.+}}, %{{.+}}, %{{.+}} : (i32, i32, i32) -> i32
+  %r = amdgpu.dot %a * %b + %c {unsignedA, unsignedB} : vector<8xi4>, vector<8xi4>, i32
+  func.return %r : i32
+}
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/dot-invalid.mlir b/mlir/test/Conversion/AMDGPUToROCDL/dot-invalid.mlir
new file mode 100644
index 0000000000000..6265f95b4abd4
--- /dev/null
+++ b/mlir/test/Conversion/AMDGPUToROCDL/dot-invalid.mlir
@@ -0,0 +1,41 @@
+// RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx906 --split-input-file -verify-diagnostics
+// RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx942 --split-input-file -verify-diagnostics
+
+// fp8 dot4 is only available on gfx11.7 and gfx12+.
+func.func @dot_fp8_requires_gfx12(%a: vector<4xf8E4M3FN>, %b: vector<4xf8E4M3FN>, %c: f32) -> f32 {
+  // expected-error@below {{'amdgpu.dot' op no intrinsic matching dot on the given chipset}}
+  // expected-error@below {{failed to legalize operation 'amdgpu.dot'}}
+  %r = amdgpu.dot %a * %b + %c : vector<4xf8E4M3FN>, vector<4xf8E4M3FN>, f32
+  func.return %r : f32
+}
+
+// -----
+
+// fdot2.f16.f16 (f16 accumulator for f16 x f16) requires gfx11+.
+func.func @dot_f16_f16_requires_gfx11(%a: vector<2xf16>, %b: vector<2xf16>, %c: f16) -> f16 {
+  // expected-error@below {{'amdgpu.dot' op no intrinsic matching dot on the given chipset}}
+  // expected-error@below {{failed to legalize operation 'amdgpu.dot'}}
+  %r = amdgpu.dot %a * %b + %c : vector<2xf16>, vector<2xf16>, f16
+  func.return %r : f16
+}
+
+// -----
+
+// fdot2.f32.bf16 is available on gfx11+ and gfx950 (ROCDLOps.td:1532);
+// neither gfx906 nor gfx942 supports it.
+func.func @dot_f32_bf16_requires_gfx11_or_gfx950(%a: vector<2xbf16>, %b: vector<2xbf16>, %c: f32) -> f32 {
+  // expected-error@below {{'amdgpu.dot' op no intrinsic matching dot on the given chipset}}
+  // expected-error@below {{failed to legalize operation 'amdgpu.dot'}}
+  %r = amdgpu.dot %a * %b + %c : vector<2xbf16>, vector<2xbf16>, f32
+  func.return %r : f32
+}
+
+// -----
+
+// Mixed-sign integer dot (sudot) requires gfx11+.
+func.func @dot_mixed_sign_requires_gfx11(%a: vector<4xi8>, %b: vector<4xi8>, %c: i32) -> i32 {
+  // expected-error@below {{'amdgpu.dot' op no intrinsic matching dot on the given chipset}}
+  // expected-error@below {{failed to legalize operation 'amdgpu.dot'}}
+  %r = amdgpu.dot %a * %b + %c {unsignedB} : vector<4xi8>, vector<4xi8>, i32
+  func.return %r : i32
+}
diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir
index d7d449bd8a579..4f59ec642171f 100644
--- a/mlir/test/Dialect/AMDGPU/invalid.mlir
+++ b/mlir/test/Dialect/AMDGPU/invalid.mlir
@@ -777,3 +777,102 @@ func.func @global_prefetch_nt_ht_not_speculative(%src: memref<64x64xf16, #gpu.ad
   amdgpu.global_prefetch %src[%i, %j] NT_HT DEV : memref<64x64xf16, #gpu.address_space<global>>
   func.return
 }
+
+// -----
+
+// DotOp: unsignedA is invalid on a float source.
+func.func @dot_float_source_unsigned_a(%a: vector<2xf16>, %b: vector<2xf16>, %c: f32) -> f32 {
+  // expected-error@+1 {{'amdgpu.dot' op unsignedA/unsignedB are only valid for integer source types}}
+  %r = amdgpu.dot %a * %b + %c {unsignedA} : vector<2xf16>, vector<2xf16>, f32
+  func.return %r : f32
+}
+
+// -----
+
+// DotOp: unsignedB is invalid on a float source.
+func.func @dot_float_source_unsigned_b(%a: vector<2xbf16>, %b: vector<2xbf16>, %c: f32) -> f32 {
+  // expected-error@+1 {{'amdgpu.dot' op unsignedA/unsignedB are only valid for integer source types}}
+  %r = amdgpu.dot %a * %b + %c {unsignedB} : vector<2xbf16>, vector<2xbf16>, f32
+  func.return %r : f32
+}
+
+// -----
+
+// DotOp: integer source requires i32 accumulator.
+func.func @dot_integer_bad_accumulator(%a: vector<4xi8>, %b: vector<4xi8>, %c: f32) -> f32 {
+  // expected-error@+1 {{'amdgpu.dot' op expected i32 accumulator for integer sources}}
+  %r = amdgpu.dot %a * %b + %c : vector<4xi8>, vector<4xi8>, f32
+  func.return %r : f32
+}
+
+// -----
+
+// DotOp: source element types must match for non-fp8 sources.
+func.func @dot_cross_float_elems(%a: vector<2xf16>, %b: vector<2xbf16>, %c: f32) -> f32 {
+  // expected-error@+1 {{'amdgpu.dot' op expected source operands to have the same element type}}
+  %r = amdgpu.dot %a * %b + %c : vector<2xf16>, vector<2xbf16>, f32
+  func.return %r : f32
+}
+
+// -----
+
+// DotOp: fp8 and non-fp8 sources cannot be mixed.
+func.func @dot_fp8_mixed_with_int(%a: vector<4xf8E4M3FN>, %b: vector<4xi8>, %c: f32) -> f32 {
+  // expected-error@+1 {{'amdgpu.dot' op expected source operands to have the same element type}}
+  %r = amdgpu.dot %a * %b + %c : vector<4xf8E4M3FN>, vector<4xi8>, f32
+  func.return %r : f32
+}
+
+// -----
+
+// DotOp: fp8 source requires f32 accumulator.
+func.func @dot_fp8_bad_accumulator(%a: vector<4xf8E4M3FN>, %b: vector<4xf8E4M3FN>, %c: f16) -> f16 {
+  // expected-error@+1 {{'amdgpu.dot' op expected f32 accumulator for fp8 sources}}
+  %r = amdgpu.dot %a * %b + %c : vector<4xf8E4M3FN>, vector<4xf8E4M3FN>, f16
+  func.return %r : f16
+}
+
+// -----
+
+// DotOp: clamp is illegal for (f16, f16) — no clamp bit in fdot2.f16.f16.
+func.func @dot_clamp_f16_f16(%a: vector<2xf16>, %b: vector<2xf16>, %c: f16) -> f16 {
+  // expected-error@+1 {{'amdgpu.dot' op clamp is not supported for this (source, accumulator) combination}}
+  %r = amdgpu.dot %a * %b + %c {clamp} : vector<2xf16>, vector<2xf16>, f16
+  func.return %r : f16
+}
+
+// -----
+
+// DotOp: clamp is illegal for (bf16, bf16) — no clamp bit in fdot2.bf16.bf16.
+func.func @dot_clamp_bf16_bf16(%a: vector<2xbf16>, %b: vector<2xbf16>, %c: bf16) -> bf16 {
+  // expected-error@+1 {{'amdgpu.dot' op clamp is not supported for this (source, accumulator) combination}}
+  %r = amdgpu.dot %a * %b + %c {clamp} : vector<2xbf16>, vector<2xbf16>, bf16
+  func.return %r : bf16
+}
+
+// -----
+
+// DotOp: clamp is illegal for any fp8 variant — no clamp bit in dot4.f32.*.
+func.func @dot_clamp_fp8(%a: vector<4xf8E4M3FN>, %b: vector<4xf8E4M3FN>, %c: f32) -> f32 {
+  // expected-error@+1 {{'amdgpu.dot' op clamp is not supported for this (source, accumulator) combination}}
+  %r = amdgpu.dot %a * %b + %c {clamp} : vector<4xf8E4M3FN>, vector<4xf8E4M3FN>, f32
+  func.return %r : f32
+}
+
+// -----
+
+// DotOp: clamp is illegal for bf8 (F8E5M2) sources as well.
+func.func @dot_clamp_bf8(%a: vector<4xf8E5M2>, %b: vector<4xf8E5M2>, %c: f32) -> f32 {
+  // expected-error@+1 {{'amdgpu.dot' op clamp is not supported for this (source, accumulator) combination}}
+  %r = amdgpu.dot %a * %b + %c {clamp} : vector<4xf8E5M2>, vector<4xf8E5M2>, f32
+  func.return %r : f32
+}
+
+// -----
+
+// DotOp: mixed-sign i16 dot has no hardware support (no sudot2 intrinsic).
+func.func @dot_mixed_sign_i16(%a: vector<2xi16>, %b: vector<2xi16>, %c: i32) -> i32 {
+  // expected-error@+1 {{'amdgpu.dot' op mixed-sign dot is not supported for 16-bit integer sources}}
+  %r = amdgpu.dot %a * %b + %c {unsignedA} : vector<2xi16>, vector<2xi16>, i32
+  func.return %r : i32
+}
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 6f4dd486610cc..a34550dc25420 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -865,3 +865,77 @@ func.func @ds_barrier_ops(%barrier: memref<!amdgpu.ds_barrier_state, #gpu.addres
   %parity = amdgpu.ds_barrier_state_phase_parity %state : !amdgpu.ds_barrier_state -> i1
   func.return
 }
+
+// CHECK-LABEL: func @dot_f16_f32
+func.func @dot_f16_f32(%a: vector<2xf16>, %b: vector<2xf16>, %c: f32) -> f32 {
+  // CHECK: amdgpu.dot {{.*}} : vector<2xf16>, vector<2xf16>, f32
+  %r = amdgpu.dot %a * %b + %c : vector<2xf16>, vector<2xf16>, f32
+  // CHECK: amdgpu.dot {{.*}} {clamp} : vector<2xf16>, vector<2xf16>, f32
+  %s = amdgpu.dot %a * %b + %c {clamp} : vector<2xf16>, vector<2xf16>, f32
+  func.return %r : f32
+}
+
+// CHECK-LABEL: func @dot_f16_f16
+func.func @dot_f16_f16(%a: vector<2xf16>, %b: vector<2xf16>, %c: f16) -> f16 {
+  // CHECK: amdgpu.dot {{.*}} : vector<2xf16>, vector<2xf16>, f16
+  %r = amdgpu.dot %a * %b + %c : vector<2xf16>, vector<2xf16>, f16
+  func.return %r : f16
+}
+
+// CHECK-LABEL: func @dot_bf16
+func.func @dot_bf16(%a: vector<2xbf16>, %b: vector<2xbf16>, %c: f32, %d: bf16) {
+  // CHECK: amdgpu.dot {{.*}} {clamp} : vector<2xbf16>, vector<2xbf16>, f32
+  %r = amdgpu.dot %a * %b + %c {clamp} : vector<2xbf16>, vector<2xbf16>, f32
+  // CHECK: amdgpu.dot {{.*}} : vector<2xbf16>, vector<2xbf16>, bf16
+  %s = amdgpu.dot %a * %b + %d : vector<2xbf16>, vector<2xbf16>, bf16
+  func.return
+}
+
+// CHECK-LABEL: func @dot_i16
+func.func @dot_i16(%a: vector<2xi16>, %b: vector<2xi16>, %c: i32) -> i32 {
+  // CHECK: amdgpu.dot {{.*}} : vector<2xi16>, vector<2xi16>, i32
+  %r = amdgpu.dot %a * %b + %c : vector<2xi16>, vector<2xi16>, i32
+  // CHECK: amdgpu.dot {{.*}} {clamp} : vector<2xi16>, vector<2xi16>, i32
+  %s = amdgpu.dot %a * %b + %c {clamp} : vector<2xi16>, vector<2xi16>, i32
+  // CHECK: amdgpu.dot {{.*}} {unsignedA, unsignedB} : vector<2xi16>, vector<2xi16>, i32
+  %t = amdgpu.dot %a * %b + %c {unsignedA, unsignedB} : vector<2xi16>, vector<2xi16>, i32
+  func.return %r : i32
+}
+
+// CHECK-LABEL: func @dot_i8
+func.func @dot_i8(%a: vector<4xi8>, %b: vector<4xi8>, %c: i32) -> i32 {
+  // CHECK: amdgpu.dot {{.*}} : vector<4xi8>, vector<4xi8>, i32
+  %r = amdgpu.dot %a * %b + %c : vector<4xi8>, vector<4xi8>, i32
+  // CHECK: amdgpu.dot {{.*}} {clamp, unsignedA, unsignedB} : vector<4xi8>, vector<4xi8>, i32
+  %s = amdgpu.dot %a * %b + %c {unsignedA, unsignedB, clamp} : vector<4xi8>, vector<4xi8>, i32
+  // CHECK: amdgpu.dot {{.*}} {unsignedB} : vector<4xi8>, vector<4xi8>, i32
+  %t = amdgpu.dot %a * %b + %c {unsignedB} : vector<4xi8>, vector<4xi8>, i32
+  // CHECK: amdgpu.dot {{.*}} {unsignedA} : vector<4xi8>, vector<4xi8>, i32
+  %u = amdgpu.dot %a * %b + %c {unsignedA} : vector<4xi8>, vector<4xi8>, i32
+  func.return %r : i32
+}
+
+// CHECK-LABEL: func @dot_i4
+func.func @dot_i4(%a: vector<8xi4>, %b: vector<8xi4>, %c: i32) -> i32 {
+  // CHECK: amdgpu.dot {{.*}} : vector<8xi4>, vector<8xi4>, i32
+  %r = amdgpu.dot %a * %b + %c : vector<8xi4>, vector<8xi4>, i32
+  // CHECK: amdgpu.dot {{.*}} {clamp, unsignedA, unsignedB} : vector<8xi4>, vector<8xi4>, i32
+  %s = amdgpu.dot %a * %b + %c {unsignedA, unsignedB, clamp} : vector<8xi4>, vector<8xi4>, i32
+  // CHECK: amdgpu.dot {{.*}} {unsignedA} : vector<8xi4>, vector<8xi4>, i32
+  %t = amdgpu.dot %a * %b + %c {unsignedA} : vector<8xi4>, vector<8xi4>, i32
+  func.return %r : i32
+}
+
+// CHECK-LABEL: func @dot_fp8
+func.func @dot_fp8(%a: vector<4xf8E4M3FN>, %b: vector<4xf8E4M3FN>,
+                   %e: vector<4xf8E5M2>, %c: f32) -> f32 {
+  // CHECK: amdgpu.dot {{.*}} : vector<4xf8E4M3FN>, vector<4xf8E4M3FN>, f32
+  %r0 = amdgpu.dot %a * %a + %c : vector<4xf8E4M3FN>, vector<4xf8E4M3FN>, f32
+  // CHECK: amdgpu.dot {{.*}} : vector<4xf8E4M3FN>, vector<4xf8E5M2>, f32
+  %r1 = amdgpu.dot %a * %e + %c : vector<4xf8E4M3FN>, vector<4xf8E5M2>, f32
+  // CHECK: amdgpu.dot {{.*}} : vector<4xf8E5M2>, vector<4xf8E4M3FN>, f32
+  %r2 = amdgpu.dot %e * %a + %c : vector<4xf8E5M2>, vector<4xf8E4M3FN>, f32
+  // CHECK: amdgpu.dot {{.*}} : vector<4xf8E5M2>, vector<4xf8E5M2>, f32
+  %r3 = amdgpu.dot %e * %e + %c : vector<4xf8E5M2>, vector<4xf8E5M2>, f32
+  func.return %r0 : f32
+}
diff --git a/mlir/unittests/Dialect/AMDGPU/AMDGPUUtilsTest.cpp b/mlir/unittests/Dialect/AMDGPU/AMDGPUUtilsTest.cpp
index 570d56f3c6ff1..a70067ca52ce5 100644
--- a/mlir/unittests/Dialect/AMDGPU/AMDGPUUtilsTest.cpp
+++ b/mlir/unittests/Dialect/AMDGPU/AMDGPUUtilsTest.cpp
@@ -58,5 +58,89 @@ TEST(ChipsetTest, Comparison) {
   EXPECT_FALSE(Chipset(9, 0, 0xa) >= Chipset(9, 4, 2));
 }
 
+TEST(ChipsetTest, HasDot1Insts) {
+  // gfx9: enabled from gfx906 onward.
+  EXPECT_FALSE(hasDot1Insts(Chipset(9, 0, 0)));
+  EXPECT_TRUE(hasDot1Insts(Chipset(9, 0, 6)));
+  EXPECT_TRUE(hasDot1Insts(Chipset(9, 0, 8)));
+  EXPECT_TRUE(hasDot1Insts(Chipset(9, 0, 0xa)));
+  EXPECT_TRUE(hasDot1Insts(Chipset(9, 4, 2)));
+  EXPECT_TRUE(hasDot1Insts(Chipset(9, 5, 0)));
+
+  // gfx10: only gfx10.1.1, gfx10.1.2, and gfx10.3+ enable Dot1.
+  EXPECT_FALSE(hasDot1Insts(Chipset(10, 1, 0))); // gfx1010
+  EXPECT_TRUE(hasDot1Insts(Chipset(10, 1, 1)));  // gfx1011
+  EXPECT_TRUE(hasDot1Insts(Chipset(10, 1, 2)));  // gfx1012
+  EXPECT_FALSE(hasDot1Insts(Chipset(10, 1, 3))); // gfx1013
+  EXPECT_TRUE(hasDot1Insts(Chipset(10, 3, 0)));  // gfx1030
+
+  // Not on gfx11+/gfx12+/gfx13+.
+  EXPECT_FALSE(hasDot1Insts(Chipset(11, 0, 0)));
+  EXPECT_FALSE(hasDot1Insts(Chipset(12, 0, 0)));
+  EXPECT_FALSE(hasDot1Insts(Chipset(12, 5, 0)));
+  EXPECT_FALSE(hasDot1Insts(Chipset(13, 0, 0)));
+}
+
+TEST(ChipsetTest, HasDot7Insts) {
+  // Same as Dot1 plus all of gfx11+/gfx12+/gfx13+.
+  EXPECT_FALSE(hasDot7Insts(Chipset(9, 0, 0)));
+  EXPECT_TRUE(hasDot7Insts(Chipset(9, 0, 6)));
+  EXPECT_FALSE(hasDot7Insts(Chipset(10, 1, 0)));
+  EXPECT_TRUE(hasDot7Insts(Chipset(11, 0, 0)));
+  EXPECT_TRUE(hasDot7Insts(Chipset(12, 0, 0)));
+  EXPECT_TRUE(hasDot7Insts(Chipset(12, 5, 0))); // gfx1250 still has Dot7.
+  EXPECT_TRUE(hasDot7Insts(Chipset(13, 0, 0)));
+}
+
+TEST(ChipsetTest, HasDot8Insts) {
+  // gfx11+ only.
+  EXPECT_FALSE(hasDot8Insts(Chipset(9, 4, 2)));
+  EXPECT_FALSE(hasDot8Insts(Chipset(10, 3, 0)));
+  EXPECT_TRUE(hasDot8Insts(Chipset(11, 0, 0)));
+  EXPECT_TRUE(hasDot8Insts(Chipset(12, 5, 0))); // gfx1250 has Dot8.
+}
+
+TEST(ChipsetTest, HasDot9Insts) {
+  // gfx11.x and gfx12.0 only.
+  EXPECT_FALSE(hasDot9Insts(Chipset(9, 4, 2)));
+  EXPECT_FALSE(hasDot9Insts(Chipset(10, 3, 0)));
+  EXPECT_TRUE(hasDot9Insts(Chipset(11, 0, 0)));
+  EXPECT_TRUE(hasDot9Insts(Chipset(11, 7, 0)));
+  EXPECT_TRUE(hasDot9Insts(Chipset(12, 0, 0)));
+  EXPECT_FALSE(hasDot9Insts(Chipset(12, 5, 0))); // gfx1250 lacks Dot9.
+  EXPECT_FALSE(hasDot9Insts(Chipset(13, 0, 0)));
+}
+
+TEST(ChipsetTest, HasDot10Insts) {
+  // Dot1's set plus gfx11.x and gfx12.0 (excludes gfx12.5+/gfx13+).
+  EXPECT_TRUE(hasDot10Insts(Chipset(9, 0, 6)));
+  EXPECT_FALSE(hasDot10Insts(Chipset(10, 1, 0)));
+  EXPECT_TRUE(hasDot10Insts(Chipset(10, 3, 0)));
+  EXPECT_TRUE(hasDot10Insts(Chipset(11, 0, 0)));
+  EXPECT_TRUE(hasDot10Insts(Chipset(12, 0, 0)));
+  EXPECT_FALSE(hasDot10Insts(Chipset(12, 5, 0)));
+  EXPECT_FALSE(hasDot10Insts(Chipset(13, 0, 0)));
+}
+
+TEST(ChipsetTest, HasDot11Insts) {
+  // Only gfx11.7 and gfx12.0.
+  EXPECT_FALSE(hasDot11Insts(Chipset(9, 5, 0)));
+  EXPECT_FALSE(hasDot11Insts(Chipset(11, 0, 0)));
+  EXPECT_FALSE(hasDot11Insts(Chipset(11, 5, 0)));
+  EXPECT_TRUE(hasDot11Insts(Chipset(11, 7, 0)));
+  EXPECT_TRUE(hasDot11Insts(Chipset(12, 0, 0)));
+  EXPECT_FALSE(hasDot11Insts(Chipset(12, 5, 0)));
+}
+
+TEST(ChipsetTest, HasDot12Insts) {
+  // gfx9.5.0, gfx11.x, and gfx12.0.
+  EXPECT_FALSE(hasDot12Insts(Chipset(9, 0, 6)));
+  EXPECT_FALSE(hasDot12Insts(Chipset(9, 4, 2)));
+  EXPECT_TRUE(hasDot12Insts(Chipset(9, 5, 0)));
+  EXPECT_TRUE(hasDot12Insts(Chipset(11, 0, 0)));
+  EXPECT_TRUE(hasDot12Insts(Chipset(12, 0, 0)));
+  EXPECT_FALSE(hasDot12Insts(Chipset(12, 5, 0)));
+}
+
 } // namespace
 } // namespace mlir::amdgpu

>From 1c62f4078c0d1b349182f3defa1775fd05fb0180 Mon Sep 17 00:00:00 2001
From: Eric Feng <Eric.Feng at amd.com>
Date: Tue, 21 Apr 2026 18:05:56 -0700
Subject: [PATCH 2/7] format

Signed-off-by: Eric Feng <Eric.Feng at amd.com>
---
 mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td    | 2 +-
 mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp | 6 ++----
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
index 1afc941332131..d7de80e008cf5 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
@@ -1126,7 +1126,7 @@ def AMDGPU_DotOp :
     | i8       | i8       | i32   | mixed      | gfx11+                    | sudot4                       |
     | i4       | i4       | i32   | s / u      | gfx906+                   | sdot8 / udot8                |
     | i4       | i4       | i32   | mixed      | gfx11+                    | sudot8                       |
-    | fp8/bf8  | fp8/bf8  | f32   | n/a        | gfx11.7, gfx12+          | dot4.f32.{fp8,bf8}.{fp8,bf8} |
+    | fp8/bf8  | fp8/bf8  | f32   | n/a        | gfx11.7, gfx12+           | dot4.f32.{fp8,bf8}.{fp8,bf8} |
     ```
 
     Example:
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index a161d0da29aee..5455a302d4d49 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -1784,8 +1784,7 @@ dotOpToIntrinsic(DotOp op, Chipset chipset) {
     if (dest.isF32() && hasDot12Insts(chipset))
       return {{ROCDL::fdot2_f32_bf16::getOperationName(), DotFamily::Clamp}};
     if (dest.isBF16() && hasDot9Insts(chipset))
-      return {{ROCDL::fdot2_bf16_bf16::getOperationName(),
-               DotFamily::NoClamp}};
+      return {{ROCDL::fdot2_bf16_bf16::getOperationName(), DotFamily::NoClamp}};
     return std::nullopt;
   }
 
@@ -1877,8 +1876,7 @@ struct DotOpLowering : public ConvertOpToLLVMPattern<DotOp> {
     std::optional<std::pair<StringRef, DotFamily>> maybeIntrinsic =
         dotOpToIntrinsic(op, chipset);
     if (!maybeIntrinsic)
-      return op.emitOpError(
-                 "no intrinsic matching dot on the given chipset: ")
+      return op.emitOpError("no intrinsic matching dot on the given chipset: ")
              << op.getSourceA().getType() << " * " << op.getSourceB().getType()
              << " + " << op.getDestC().getType();
 

>From 08860105815d8400fcaaf085ab0992d2b12ccf4e Mon Sep 17 00:00:00 2001
From: Eric Feng <Eric.Feng at amd.com>
Date: Tue, 21 Apr 2026 18:07:50 -0700
Subject: [PATCH 3/7] nits

Signed-off-by: Eric Feng <Eric.Feng at amd.com>
---
 mlir/test/Conversion/AMDGPUToROCDL/dot-gfx11.mlir | 3 ---
 mlir/test/Conversion/AMDGPUToROCDL/dot-gfx9.mlir  | 4 ----
 2 files changed, 7 deletions(-)

diff --git a/mlir/test/Conversion/AMDGPUToROCDL/dot-gfx11.mlir b/mlir/test/Conversion/AMDGPUToROCDL/dot-gfx11.mlir
index 51d553748ad6a..d5ba51a8a5501 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/dot-gfx11.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/dot-gfx11.mlir
@@ -1,8 +1,5 @@
 // RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx1100 | FileCheck %s
 
-// Variants first available on gfx11: fdot2 variants with narrower
-// accumulators, fdot2.f32.bf16, and the mixed-sign sudot* ops.
-
 // CHECK-LABEL: @dot_fdot2_f16_f16
 func.func @dot_fdot2_f16_f16(%a: vector<2xf16>, %b: vector<2xf16>, %c: f16) -> f16 {
   // CHECK: rocdl.fdot2.f16.f16 %{{.+}}, %{{.+}}, %{{.+}} : (vector<2xf16>, vector<2xf16>, f16) -> f16
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/dot-gfx9.mlir b/mlir/test/Conversion/AMDGPUToROCDL/dot-gfx9.mlir
index c3b4d459992a7..b6f3b9e11c717 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/dot-gfx9.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/dot-gfx9.mlir
@@ -1,9 +1,5 @@
 // RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx906 | FileCheck %s
 
-// Dot variants available from gfx906. Integer sources arrive unpacked;
-// vector<4xi8> and vector<8xi4> are bitcast to scalar i32 (little-endian
-// lane order) before being passed to ROCDL.
-
 // CHECK-LABEL: @dot_fdot2
 func.func @dot_fdot2(%a: vector<2xf16>, %b: vector<2xf16>, %c: f32) -> f32 {
   // CHECK: rocdl.fdot2 %{{.+}}, %{{.+}}, %{{.+}} : (vector<2xf16>, vector<2xf16>, f32) -> f32

>From b84d8facf8bdf3a5ffa6d6ee206ba6501f280147 Mon Sep 17 00:00:00 2001
From: Eric Feng <Eric.Feng at amd.com>
Date: Tue, 21 Apr 2026 18:09:00 -0700
Subject: [PATCH 4/7] nits

Signed-off-by: Eric Feng <Eric.Feng at amd.com>
---
 mlir/test/Conversion/AMDGPUToROCDL/dot-invalid.mlir | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/mlir/test/Conversion/AMDGPUToROCDL/dot-invalid.mlir b/mlir/test/Conversion/AMDGPUToROCDL/dot-invalid.mlir
index 6265f95b4abd4..02519ac0c6dd1 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/dot-invalid.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/dot-invalid.mlir
@@ -21,8 +21,7 @@ func.func @dot_f16_f16_requires_gfx11(%a: vector<2xf16>, %b: vector<2xf16>, %c:
 
 // -----
 
-// fdot2.f32.bf16 is available on gfx11+ and gfx950 (ROCDLOps.td:1532);
-// neither gfx906 nor gfx942 supports it.
+// fdot2.f32.bf16 is available on gfx11+ and gfx950+.
 func.func @dot_f32_bf16_requires_gfx11_or_gfx950(%a: vector<2xbf16>, %b: vector<2xbf16>, %c: f32) -> f32 {
   // expected-error @below {{'amdgpu.dot' op no intrinsic matching dot on the given chipset}}
   // expected-error @below {{failed to legalize operation 'amdgpu.dot'}}

>From 4f2707a5feb973cb5b9590ea2b581854bbb5a1d2 Mon Sep 17 00:00:00 2001
From: Eric Feng <Eric.Feng at amd.com>
Date: Tue, 21 Apr 2026 18:12:19 -0700
Subject: [PATCH 5/7] nit

Signed-off-by: Eric Feng <Eric.Feng at amd.com>
---
 mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 5455a302d4d49..7f4a58e205bb2 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -683,8 +683,7 @@ static Value packSmallFloatVectorOperand(ConversionPatternRewriter &rewriter,
   return input;
 }
 
-/// Converts sparse MFMA/WMMA (smfmac/swmmac) operands to the expected ROCDL
-/// types.
+/// Converts packed vector operands to the expected ROCDL types.
 static Value convertPackedVectorOperand(ConversionPatternRewriter &rewriter,
                                         Location loc, Value input,
                                         bool allowBf16 = true) {

>From cb7f71fc4d81a1e87b2549b0bb6aa18ee7698180 Mon Sep 17 00:00:00 2001
From: Eric Feng <Eric.Feng at amd.com>
Date: Tue, 21 Apr 2026 18:13:07 -0700
Subject: [PATCH 6/7] nits

Signed-off-by: Eric Feng <Eric.Feng at amd.com>
---
 mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
index d7de80e008cf5..1b396e484d807 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
@@ -1111,8 +1111,7 @@ def AMDGPU_DotOp :
     The `amdgpu.dot` op is an MLIR wrapper over the `v_dot*` family of intrinsics,
     which compute `D = sum_i A[i] * B[i] + C`. 
 
-    Variants (source, dest, signedness, chipset -> intrinsic). For the precise
-    per-feature chipset enablement, see `hasDot{N}Insts` in `Chipset.h`.
+    Variants (source, dest, signedness, chipset -> intrinsic).
 
     ```text
     | A elem   | B elem   | destC | signedness | chipset                   | ROCDL op                     |

>From 83eb0691299d76a96472e5fd29fc48e7363717d5 Mon Sep 17 00:00:00 2001
From: Eric Feng <Eric.Feng at amd.com>
Date: Wed, 22 Apr 2026 22:18:27 -0700
Subject: [PATCH 7/7] remove utils test

Signed-off-by: Eric Feng <Eric.Feng at amd.com>
---
 .../Dialect/AMDGPU/AMDGPUUtilsTest.cpp        | 84 -------------------
 1 file changed, 84 deletions(-)

diff --git a/mlir/unittests/Dialect/AMDGPU/AMDGPUUtilsTest.cpp b/mlir/unittests/Dialect/AMDGPU/AMDGPUUtilsTest.cpp
index a70067ca52ce5..570d56f3c6ff1 100644
--- a/mlir/unittests/Dialect/AMDGPU/AMDGPUUtilsTest.cpp
+++ b/mlir/unittests/Dialect/AMDGPU/AMDGPUUtilsTest.cpp
@@ -58,89 +58,5 @@ TEST(ChipsetTest, Comparison) {
   EXPECT_FALSE(Chipset(9, 0, 0xa) >= Chipset(9, 4, 2));
 }
 
-TEST(ChipsetTest, HasDot1Insts) {
-  // gfx9: enabled from gfx906 onward.
-  EXPECT_FALSE(hasDot1Insts(Chipset(9, 0, 0)));
-  EXPECT_TRUE(hasDot1Insts(Chipset(9, 0, 6)));
-  EXPECT_TRUE(hasDot1Insts(Chipset(9, 0, 8)));
-  EXPECT_TRUE(hasDot1Insts(Chipset(9, 0, 0xa)));
-  EXPECT_TRUE(hasDot1Insts(Chipset(9, 4, 2)));
-  EXPECT_TRUE(hasDot1Insts(Chipset(9, 5, 0)));
-
-  // gfx10: only gfx10.1.1, gfx10.1.2, and gfx10.3+ enable Dot1.
-  EXPECT_FALSE(hasDot1Insts(Chipset(10, 1, 0))); // gfx1010
-  EXPECT_TRUE(hasDot1Insts(Chipset(10, 1, 1)));  // gfx1011
-  EXPECT_TRUE(hasDot1Insts(Chipset(10, 1, 2)));  // gfx1012
-  EXPECT_FALSE(hasDot1Insts(Chipset(10, 1, 3))); // gfx1013
-  EXPECT_TRUE(hasDot1Insts(Chipset(10, 3, 0)));  // gfx1030
-
-  // Not on gfx11+/gfx12+/gfx13+.
-  EXPECT_FALSE(hasDot1Insts(Chipset(11, 0, 0)));
-  EXPECT_FALSE(hasDot1Insts(Chipset(12, 0, 0)));
-  EXPECT_FALSE(hasDot1Insts(Chipset(12, 5, 0)));
-  EXPECT_FALSE(hasDot1Insts(Chipset(13, 0, 0)));
-}
-
-TEST(ChipsetTest, HasDot7Insts) {
-  // Same as Dot1 plus all of gfx11+/gfx12+/gfx13+.
-  EXPECT_FALSE(hasDot7Insts(Chipset(9, 0, 0)));
-  EXPECT_TRUE(hasDot7Insts(Chipset(9, 0, 6)));
-  EXPECT_FALSE(hasDot7Insts(Chipset(10, 1, 0)));
-  EXPECT_TRUE(hasDot7Insts(Chipset(11, 0, 0)));
-  EXPECT_TRUE(hasDot7Insts(Chipset(12, 0, 0)));
-  EXPECT_TRUE(hasDot7Insts(Chipset(12, 5, 0))); // gfx1250 still has Dot7.
-  EXPECT_TRUE(hasDot7Insts(Chipset(13, 0, 0)));
-}
-
-TEST(ChipsetTest, HasDot8Insts) {
-  // gfx11+ only.
-  EXPECT_FALSE(hasDot8Insts(Chipset(9, 4, 2)));
-  EXPECT_FALSE(hasDot8Insts(Chipset(10, 3, 0)));
-  EXPECT_TRUE(hasDot8Insts(Chipset(11, 0, 0)));
-  EXPECT_TRUE(hasDot8Insts(Chipset(12, 5, 0))); // gfx1250 has Dot8.
-}
-
-TEST(ChipsetTest, HasDot9Insts) {
-  // gfx11.x and gfx12.0 only.
-  EXPECT_FALSE(hasDot9Insts(Chipset(9, 4, 2)));
-  EXPECT_FALSE(hasDot9Insts(Chipset(10, 3, 0)));
-  EXPECT_TRUE(hasDot9Insts(Chipset(11, 0, 0)));
-  EXPECT_TRUE(hasDot9Insts(Chipset(11, 7, 0)));
-  EXPECT_TRUE(hasDot9Insts(Chipset(12, 0, 0)));
-  EXPECT_FALSE(hasDot9Insts(Chipset(12, 5, 0))); // gfx1250 lacks Dot9.
-  EXPECT_FALSE(hasDot9Insts(Chipset(13, 0, 0)));
-}
-
-TEST(ChipsetTest, HasDot10Insts) {
-  // Dot1's set plus gfx11.x and gfx12.0 (excludes gfx12.5+/gfx13+).
-  EXPECT_TRUE(hasDot10Insts(Chipset(9, 0, 6)));
-  EXPECT_FALSE(hasDot10Insts(Chipset(10, 1, 0)));
-  EXPECT_TRUE(hasDot10Insts(Chipset(10, 3, 0)));
-  EXPECT_TRUE(hasDot10Insts(Chipset(11, 0, 0)));
-  EXPECT_TRUE(hasDot10Insts(Chipset(12, 0, 0)));
-  EXPECT_FALSE(hasDot10Insts(Chipset(12, 5, 0)));
-  EXPECT_FALSE(hasDot10Insts(Chipset(13, 0, 0)));
-}
-
-TEST(ChipsetTest, HasDot11Insts) {
-  // Only gfx11.7 and gfx12.0.
-  EXPECT_FALSE(hasDot11Insts(Chipset(9, 5, 0)));
-  EXPECT_FALSE(hasDot11Insts(Chipset(11, 0, 0)));
-  EXPECT_FALSE(hasDot11Insts(Chipset(11, 5, 0)));
-  EXPECT_TRUE(hasDot11Insts(Chipset(11, 7, 0)));
-  EXPECT_TRUE(hasDot11Insts(Chipset(12, 0, 0)));
-  EXPECT_FALSE(hasDot11Insts(Chipset(12, 5, 0)));
-}
-
-TEST(ChipsetTest, HasDot12Insts) {
-  // gfx9.5.0, gfx11.x, and gfx12.0.
-  EXPECT_FALSE(hasDot12Insts(Chipset(9, 0, 6)));
-  EXPECT_FALSE(hasDot12Insts(Chipset(9, 4, 2)));
-  EXPECT_TRUE(hasDot12Insts(Chipset(9, 5, 0)));
-  EXPECT_TRUE(hasDot12Insts(Chipset(11, 0, 0)));
-  EXPECT_TRUE(hasDot12Insts(Chipset(12, 0, 0)));
-  EXPECT_FALSE(hasDot12Insts(Chipset(12, 5, 0)));
-}
-
 } // namespace
 } // namespace mlir::amdgpu