[Mlir-commits] [mlir] 0d45876 - [ROCDL] Add dot intrinsics to rocdl (#193129)

Tue Apr 21 10:48:30 PDT 2026

Author: Eric Feng
Date: 2026-04-21T17:48:26Z
New Revision: 0d45876e43a8189f1076ecab440f6106d1c97bd7

URL: https://github.com/llvm/llvm-project/commit/0d45876e43a8189f1076ecab440f6106d1c97bd7
DIFF: https://github.com/llvm/llvm-project/commit/0d45876e43a8189f1076ecab440f6106d1c97bd7.diff

LOG: [ROCDL] Add dot intrinsics to rocdl (#193129)

This patch adds dot-product intrinsic support to the rocdl dialect. Having
these (including a follow-up `amdgpu` wrapper) as first-class citizens in
MLIR will allow us to lower thread-local reductions involving `<=16bit`
data more effectively. This is in line with the spirit of `dot` intrinsic
support in the existing edge dialects (`x86`, `nvvm`, `spirv`).

Assisted by: Claude

---------

Signed-off-by: Eric Feng <Eric.Feng at amd.com>

Added: 
    

Modified: 
    mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
    mlir/test/Dialect/LLVMIR/rocdl.mlir
    mlir/test/Target/LLVMIR/rocdl.mlir

Removed: 
    


################################################################################
diff  --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index b13206ce5c342..c887598626d17 100644

--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -1425,6 +1425,125 @@ def ROCDL_wmma_scale_f32_32x16x128_f4       : ROCDL_WMMA_Scale_F4_IntrOp<"wmma.s
 def ROCDL_wmma_scale16_f32_32x16x128_f4     : ROCDL_WMMA_Scale_F4_IntrOp<"wmma.scale16.f32.32x16x128.f4", AnyInteger, F32, I64>;
 
 
+//===---------------------------------------------------------------------===//
+// Dot product intrinsics (v_dot*)
+class ROCDL_Dot_IntrOp<string mnemonic, ROCDL_NamedType A, ROCDL_NamedType B,
+                       ROCDL_NamedType C> :
+    ROCDL_ConcreteNonMemIntrOp<mnemonic, [Pure], 1, [3], ["clamp"]>,
+  Arguments<(ins A:$a, B:$b, C:$c,
+                 DefaultValuedAttr<I1Attr, "0">:$clamp)> {
+  let results = (outs C:$res);
+  let assemblyFormat = [{
+    $a `,` $b `,` $c attr-dict `:` functional-type(operands, $res)
+  }];
+  let description = [{
+    Packed intra-lane dot-product with optional result clamping (`clamp`).
+    Computes `res = sum_i a[i]*b[i] + c`, where `a` and `b` hold packed
+    16/8/4-bit data (for `dot2`, `dot4`, `dot8` respectively).
+
+    Example:
+    ```mlir
+    %r = rocdl.}] # mnemonic # [{ %a, %b, %c {clamp = true} :
+         (}] # A.typeName # [{, }] # B.typeName # [{, }] # C.typeName # [{) -> }]
+         # C.typeName # [{
+    ```
+  }];
+}
+
+class ROCDL_Dot_NoClamp_IntrOp<string mnemonic, ROCDL_NamedType A,
+                               ROCDL_NamedType B, ROCDL_NamedType C> :
+    ROCDL_ConcreteNonMemIntrOp<mnemonic, [Pure], 1, [], []>,
+  Arguments<(ins A:$a, B:$b, C:$c)> {
+  let results = (outs C:$res);
+  let assemblyFormat = [{
+    $a `,` $b `,` $c attr-dict `:` functional-type(operands, $res)
+  }];
+  let description = [{
+    Packed intra-lane dot-product with no clamp control.
+    Computes `res = sum_i a[i]*b[i] + c`. Covers the full-f16/bf16
+    accumulator forms (`fdot2.f16.f16`, `fdot2.bf16.bf16`) and the
+    FP8/BF8 `dot4.f32.*` variants, whose hardware instructions have no
+    CLAMP bit in their modifier word.
+
+    Example:
+    ```mlir
+    %r = rocdl.}] # mnemonic # [{ %a, %b, %c : (}] # A.typeName # [{, }]
+         # B.typeName # [{, }] # C.typeName # [{) -> }] # C.typeName # [{
+    ```
+  }];
+}
+
+class ROCDL_Sudot_IntrOp<string mnemonic> :
+    ROCDL_ConcreteNonMemIntrOp<mnemonic, [Pure], 1, [0, 2, 5],
+                               ["signA", "signB", "clamp"]>,
+  Arguments<(ins DefaultValuedAttr<I1Attr, "0">:$signA,
+                 I32:$a,
+                 DefaultValuedAttr<I1Attr, "0">:$signB,
+                 I32:$b,
+                 I32:$c,
+                 DefaultValuedAttr<I1Attr, "0">:$clamp)> {
+  let results = (outs I32:$res);
+  let assemblyFormat = [{
+    $a `,` $b `,` $c attr-dict `:` functional-type(operands, $res)
+  }];
+  let description = [{
+    Mixed-signedness packed dot-product with per-operand sign controls.
+    Computes `res = sum_i a[i]*b[i] + c`. Each packed element of `a` is
+    treated as signed when `signA = true`; when `signA = false`, the
+    unsigned element value is zero-extended into a wider signed integer.
+    `signB` controls the same for `b`. `clamp` controls result clamping.
+
+    These ops correspond to RDNA's unified mixed-sign `v_dot4_i32_iu8`
+    and `v_dot8_i32_iu4` instructions (gfx11+). 
+
+    Example:
+    ```mlir
+    %r = rocdl.}] # mnemonic # [{ %a, %b, %c
+           {signA = true, signB = false, clamp = true} :
+         (i32, i32, i32) -> i32
+    ```
+  }];
+}
+
+// Available from gfx906.
+def ROCDL_fdot2 : ROCDL_Dot_IntrOp<"fdot2",
+    ROCDL_V2F16Type, ROCDL_V2F16Type, ROCDL_Scalar<F32>>;
+def ROCDL_sdot2 : ROCDL_Dot_IntrOp<"sdot2",
+    ROCDL_V2I16Type, ROCDL_V2I16Type, ROCDL_Scalar<I32>>;
+def ROCDL_udot2 : ROCDL_Dot_IntrOp<"udot2",
+    ROCDL_V2I16Type, ROCDL_V2I16Type, ROCDL_Scalar<I32>>;
+def ROCDL_sdot4 : ROCDL_Dot_IntrOp<"sdot4",
+    ROCDL_Scalar<I32>, ROCDL_Scalar<I32>, ROCDL_Scalar<I32>>;
+def ROCDL_udot4 : ROCDL_Dot_IntrOp<"udot4",
+    ROCDL_Scalar<I32>, ROCDL_Scalar<I32>, ROCDL_Scalar<I32>>;
+def ROCDL_sdot8 : ROCDL_Dot_IntrOp<"sdot8",
+    ROCDL_Scalar<I32>, ROCDL_Scalar<I32>, ROCDL_Scalar<I32>>;
+def ROCDL_udot8 : ROCDL_Dot_IntrOp<"udot8",
+    ROCDL_Scalar<I32>, ROCDL_Scalar<I32>, ROCDL_Scalar<I32>>;
+
+// Available from gfx11.
+def ROCDL_fdot2_f16_f16   : ROCDL_Dot_NoClamp_IntrOp<"fdot2.f16.f16",
+    ROCDL_V2F16Type,  ROCDL_V2F16Type,  ROCDL_Scalar<F16>>;
+def ROCDL_fdot2_bf16_bf16 : ROCDL_Dot_NoClamp_IntrOp<"fdot2.bf16.bf16",
+    ROCDL_V2BF16Type, ROCDL_V2BF16Type, ROCDL_Scalar<BF16>>;
+def ROCDL_sudot4 : ROCDL_Sudot_IntrOp<"sudot4">;
+def ROCDL_sudot8 : ROCDL_Sudot_IntrOp<"sudot8">;
+
+// Available from gfx11 and gfx950.
+def ROCDL_fdot2_f32_bf16 : ROCDL_Dot_IntrOp<"fdot2.f32.bf16",
+    ROCDL_V2BF16Type, ROCDL_V2BF16Type, ROCDL_Scalar<F32>>;
+
+// Available from gfx12.
+def ROCDL_dot4_f32_fp8_fp8 : ROCDL_Dot_NoClamp_IntrOp<"dot4.f32.fp8.fp8",
+    ROCDL_Scalar<I32>, ROCDL_Scalar<I32>, ROCDL_Scalar<F32>>;
+def ROCDL_dot4_f32_fp8_bf8 : ROCDL_Dot_NoClamp_IntrOp<"dot4.f32.fp8.bf8",
+    ROCDL_Scalar<I32>, ROCDL_Scalar<I32>, ROCDL_Scalar<F32>>;
+def ROCDL_dot4_f32_bf8_fp8 : ROCDL_Dot_NoClamp_IntrOp<"dot4.f32.bf8.fp8",
+    ROCDL_Scalar<I32>, ROCDL_Scalar<I32>, ROCDL_Scalar<F32>>;
+def ROCDL_dot4_f32_bf8_bf8 : ROCDL_Dot_NoClamp_IntrOp<"dot4.f32.bf8.bf8",
+    ROCDL_Scalar<I32>, ROCDL_Scalar<I32>, ROCDL_Scalar<F32>>;
+
+
 //===---------------------------------------------------------------------===//
 // SWMMAC intrinsics
 class ROCDL_SWMMAC_V0_IntrOp<string mnemonic, Type AB, Type CD> : ROCDL_IntrOp<mnemonic,

diff  --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir
index 1d835b352e519..1da7b63efdd7e 100644
--- a/mlir/test/Dialect/LLVMIR/rocdl.mlir
+++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir
@@ -1627,6 +1627,107 @@ llvm.func @rocdl.swmmac(%v32f16 : vector<32xf16>, %v32bf16 : vector<32xbf16>,
   llvm.return %w32_0 : vector<8xf32>
 }
 
+// -----
+
+// CHECK-LABEL: @rocdl_dot_fdot2_family
+llvm.func @rocdl_dot_fdot2_family(%v2f16: vector<2xf16>, %v2bf16: vector<2xbf16>,
+                                  %f16: f16, %bf16: bf16, %f32: f32) -> f32 {
+  // CHECK: rocdl.fdot2 %{{.*}}, %{{.*}}, %{{.*}} : (vector<2xf16>, vector<2xf16>, f32) -> f32
+  %r0 = rocdl.fdot2 %v2f16, %v2f16, %f32 : (vector<2xf16>, vector<2xf16>, f32) -> f32
+  // CHECK: rocdl.fdot2 %{{.*}}, %{{.*}}, %{{.*}} {clamp = true} : (vector<2xf16>, vector<2xf16>, f32) -> f32
+  %r0c = rocdl.fdot2 %v2f16, %v2f16, %f32 {clamp = true} : (vector<2xf16>, vector<2xf16>, f32) -> f32
+
+  // CHECK: rocdl.fdot2.f32.bf16 %{{.*}}, %{{.*}}, %{{.*}} : (vector<2xbf16>, vector<2xbf16>, f32) -> f32
+  %r1 = rocdl.fdot2.f32.bf16 %v2bf16, %v2bf16, %f32 : (vector<2xbf16>, vector<2xbf16>, f32) -> f32
+  // CHECK: rocdl.fdot2.f32.bf16 %{{.*}}, %{{.*}}, %{{.*}} {clamp = true} : (vector<2xbf16>, vector<2xbf16>, f32) -> f32
+  %r1c = rocdl.fdot2.f32.bf16 %v2bf16, %v2bf16, %f32 {clamp = true} : (vector<2xbf16>, vector<2xbf16>, f32) -> f32
+
+  // CHECK: rocdl.fdot2.f16.f16 %{{.*}}, %{{.*}}, %{{.*}} : (vector<2xf16>, vector<2xf16>, f16) -> f16
+  %r3 = rocdl.fdot2.f16.f16 %v2f16, %v2f16, %f16 : (vector<2xf16>, vector<2xf16>, f16) -> f16
+
+  // CHECK: rocdl.fdot2.bf16.bf16 %{{.*}}, %{{.*}}, %{{.*}} : (vector<2xbf16>, vector<2xbf16>, bf16) -> bf16
+  %r4 = rocdl.fdot2.bf16.bf16 %v2bf16, %v2bf16, %bf16 : (vector<2xbf16>, vector<2xbf16>, bf16) -> bf16
+
+  llvm.return %r0 : f32
+}
+
+// -----
+
+// CHECK-LABEL: @rocdl_dot_sdot_udot_family
+llvm.func @rocdl_dot_sdot_udot_family(%v2i16: vector<2xi16>, %i32: i32) -> i32 {
+  // CHECK: rocdl.sdot2 %{{.*}}, %{{.*}}, %{{.*}} : (vector<2xi16>, vector<2xi16>, i32) -> i32
+  %r0 = rocdl.sdot2 %v2i16, %v2i16, %i32 : (vector<2xi16>, vector<2xi16>, i32) -> i32
+  // CHECK: rocdl.sdot2 %{{.*}}, %{{.*}}, %{{.*}} {clamp = true} : (vector<2xi16>, vector<2xi16>, i32) -> i32
+  %r0c = rocdl.sdot2 %v2i16, %v2i16, %i32 {clamp = true} : (vector<2xi16>, vector<2xi16>, i32) -> i32
+
+  // CHECK: rocdl.udot2 %{{.*}}, %{{.*}}, %{{.*}} : (vector<2xi16>, vector<2xi16>, i32) -> i32
+  %r1 = rocdl.udot2 %v2i16, %v2i16, %i32 : (vector<2xi16>, vector<2xi16>, i32) -> i32
+  // CHECK: rocdl.udot2 %{{.*}}, %{{.*}}, %{{.*}} {clamp = true} : (vector<2xi16>, vector<2xi16>, i32) -> i32
+  %r1c = rocdl.udot2 %v2i16, %v2i16, %i32 {clamp = true} : (vector<2xi16>, vector<2xi16>, i32) -> i32
+
+  // CHECK: rocdl.sdot4 %{{.*}}, %{{.*}}, %{{.*}} : (i32, i32, i32) -> i32
+  %r2 = rocdl.sdot4 %i32, %i32, %i32 : (i32, i32, i32) -> i32
+  // CHECK: rocdl.sdot4 %{{.*}}, %{{.*}}, %{{.*}} {clamp = true} : (i32, i32, i32) -> i32
+  %r2c = rocdl.sdot4 %i32, %i32, %i32 {clamp = true} : (i32, i32, i32) -> i32
+
+  // CHECK: rocdl.udot4 %{{.*}}, %{{.*}}, %{{.*}} : (i32, i32, i32) -> i32
+  %r3 = rocdl.udot4 %i32, %i32, %i32 : (i32, i32, i32) -> i32
+  // CHECK: rocdl.udot4 %{{.*}}, %{{.*}}, %{{.*}} {clamp = true} : (i32, i32, i32) -> i32
+  %r3c = rocdl.udot4 %i32, %i32, %i32 {clamp = true} : (i32, i32, i32) -> i32
+
+  // CHECK: rocdl.sdot8 %{{.*}}, %{{.*}}, %{{.*}} : (i32, i32, i32) -> i32
+  %r4 = rocdl.sdot8 %i32, %i32, %i32 : (i32, i32, i32) -> i32
+  // CHECK: rocdl.sdot8 %{{.*}}, %{{.*}}, %{{.*}} {clamp = true} : (i32, i32, i32) -> i32
+  %r4c = rocdl.sdot8 %i32, %i32, %i32 {clamp = true} : (i32, i32, i32) -> i32
+
+  // CHECK: rocdl.udot8 %{{.*}}, %{{.*}}, %{{.*}} : (i32, i32, i32) -> i32
+  %r5 = rocdl.udot8 %i32, %i32, %i32 : (i32, i32, i32) -> i32
+  // CHECK: rocdl.udot8 %{{.*}}, %{{.*}}, %{{.*}} {clamp = true} : (i32, i32, i32) -> i32
+  %r5c = rocdl.udot8 %i32, %i32, %i32 {clamp = true} : (i32, i32, i32) -> i32
+
+  llvm.return %r0 : i32
+}
+
+// -----
+
+// CHECK-LABEL: @rocdl_dot_sudot_family
+llvm.func @rocdl_dot_sudot_family(%i32: i32) -> i32 {
+  // CHECK: rocdl.sudot4 %{{.*}}, %{{.*}}, %{{.*}} : (i32, i32, i32) -> i32
+  %r0 = rocdl.sudot4 %i32, %i32, %i32 : (i32, i32, i32) -> i32
+  // CHECK: rocdl.sudot4 %{{.*}}, %{{.*}}, %{{.*}} {signA = true} : (i32, i32, i32) -> i32
+  %r0a = rocdl.sudot4 %i32, %i32, %i32 {signA = true, signB = false, clamp = false} : (i32, i32, i32) -> i32
+  // CHECK: rocdl.sudot4 %{{.*}}, %{{.*}}, %{{.*}} {signB = true} : (i32, i32, i32) -> i32
+  %r0b = rocdl.sudot4 %i32, %i32, %i32 {signA = false, signB = true, clamp = false} : (i32, i32, i32) -> i32
+  // CHECK: rocdl.sudot4 %{{.*}}, %{{.*}}, %{{.*}} {clamp = true, signA = true, signB = true} : (i32, i32, i32) -> i32
+  %r0c = rocdl.sudot4 %i32, %i32, %i32 {signA = true, signB = true, clamp = true} : (i32, i32, i32) -> i32
+
+  // CHECK: rocdl.sudot8 %{{.*}}, %{{.*}}, %{{.*}} : (i32, i32, i32) -> i32
+  %r1 = rocdl.sudot8 %i32, %i32, %i32 : (i32, i32, i32) -> i32
+  // CHECK: rocdl.sudot8 %{{.*}}, %{{.*}}, %{{.*}} {signA = true} : (i32, i32, i32) -> i32
+  %r1a = rocdl.sudot8 %i32, %i32, %i32 {signA = true, signB = false, clamp = false} : (i32, i32, i32) -> i32
+  // CHECK: rocdl.sudot8 %{{.*}}, %{{.*}}, %{{.*}} {signB = true} : (i32, i32, i32) -> i32
+  %r1b = rocdl.sudot8 %i32, %i32, %i32 {signA = false, signB = true, clamp = false} : (i32, i32, i32) -> i32
+  // CHECK: rocdl.sudot8 %{{.*}}, %{{.*}}, %{{.*}} {clamp = true, signA = true, signB = true} : (i32, i32, i32) -> i32
+  %r1c = rocdl.sudot8 %i32, %i32, %i32 {signA = true, signB = true, clamp = true} : (i32, i32, i32) -> i32
+
+  llvm.return %r0 : i32
+}
+
+// -----
+
+// CHECK-LABEL: @rocdl_dot_fp8_family
+llvm.func @rocdl_dot_fp8_family(%i32: i32, %f32: f32) -> f32 {
+  // CHECK: rocdl.dot4.f32.fp8.fp8 %{{.*}}, %{{.*}}, %{{.*}} : (i32, i32, f32) -> f32
+  %r0 = rocdl.dot4.f32.fp8.fp8 %i32, %i32, %f32 : (i32, i32, f32) -> f32
+  // CHECK: rocdl.dot4.f32.fp8.bf8 %{{.*}}, %{{.*}}, %{{.*}} : (i32, i32, f32) -> f32
+  %r1 = rocdl.dot4.f32.fp8.bf8 %i32, %i32, %f32 : (i32, i32, f32) -> f32
+  // CHECK: rocdl.dot4.f32.bf8.fp8 %{{.*}}, %{{.*}}, %{{.*}} : (i32, i32, f32) -> f32
+  %r2 = rocdl.dot4.f32.bf8.fp8 %i32, %i32, %f32 : (i32, i32, f32) -> f32
+  // CHECK: rocdl.dot4.f32.bf8.bf8 %{{.*}}, %{{.*}}, %{{.*}} : (i32, i32, f32) -> f32
+  %r3 = rocdl.dot4.f32.bf8.bf8 %i32, %i32, %f32 : (i32, i32, f32) -> f32
+
+  llvm.return %r0 : f32
+}
 
 // -----
 

diff  --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index 483d8896383c3..169702efd312f 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -2098,6 +2098,100 @@ llvm.func @rocdl.cvt.scalef32.sr.pk16(%v16xf32: vector<16xf32>,
   llvm.return
 }
 
+// CHECK-LABEL: @rocdl_dot_fdot2_family
+llvm.func @rocdl_dot_fdot2_family(%v2f16: vector<2xf16>, %v2bf16: vector<2xbf16>,
+                                  %f16: f16, %bf16: bf16, %f32: f32) -> f32 {
+  // CHECK: call float @llvm.amdgcn.fdot2(<2 x half> %{{.*}}, <2 x half> %{{.*}}, float %{{.*}}, i1 false)
+  %r0 = rocdl.fdot2 %v2f16, %v2f16, %f32 : (vector<2xf16>, vector<2xf16>, f32) -> f32
+  // CHECK: call float @llvm.amdgcn.fdot2(<2 x half> %{{.*}}, <2 x half> %{{.*}}, float %{{.*}}, i1 true)
+  %r0c = rocdl.fdot2 %v2f16, %v2f16, %f32 {clamp = true} : (vector<2xf16>, vector<2xf16>, f32) -> f32
+
+  // CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %{{.*}}, <2 x bfloat> %{{.*}}, float %{{.*}}, i1 false)
+  %r1 = rocdl.fdot2.f32.bf16 %v2bf16, %v2bf16, %f32 : (vector<2xbf16>, vector<2xbf16>, f32) -> f32
+  // CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %{{.*}}, <2 x bfloat> %{{.*}}, float %{{.*}}, i1 true)
+  %r1c = rocdl.fdot2.f32.bf16 %v2bf16, %v2bf16, %f32 {clamp = true} : (vector<2xbf16>, vector<2xbf16>, f32) -> f32
+
+  // CHECK: call half @llvm.amdgcn.fdot2.f16.f16(<2 x half> %{{.*}}, <2 x half> %{{.*}}, half %{{.*}})
+  %r3 = rocdl.fdot2.f16.f16 %v2f16, %v2f16, %f16 : (vector<2xf16>, vector<2xf16>, f16) -> f16
+
+  // CHECK: call bfloat @llvm.amdgcn.fdot2.bf16.bf16(<2 x bfloat> %{{.*}}, <2 x bfloat> %{{.*}}, bfloat %{{.*}})
+  %r4 = rocdl.fdot2.bf16.bf16 %v2bf16, %v2bf16, %bf16 : (vector<2xbf16>, vector<2xbf16>, bf16) -> bf16
+
+  llvm.return %r0 : f32
+}
+
+// CHECK-LABEL: @rocdl_dot_sdot_udot_family
+llvm.func @rocdl_dot_sdot_udot_family(%v2i16: vector<2xi16>, %i32: i32) -> i32 {
+  // CHECK: call i32 @llvm.amdgcn.sdot2(<2 x i16> %{{.*}}, <2 x i16> %{{.*}}, i32 %{{.*}}, i1 false)
+  %r0 = rocdl.sdot2 %v2i16, %v2i16, %i32 : (vector<2xi16>, vector<2xi16>, i32) -> i32
+  // CHECK: call i32 @llvm.amdgcn.sdot2(<2 x i16> %{{.*}}, <2 x i16> %{{.*}}, i32 %{{.*}}, i1 true)
+  %r0c = rocdl.sdot2 %v2i16, %v2i16, %i32 {clamp = true} : (vector<2xi16>, vector<2xi16>, i32) -> i32
+
+  // CHECK: call i32 @llvm.amdgcn.udot2(<2 x i16> %{{.*}}, <2 x i16> %{{.*}}, i32 %{{.*}}, i1 false)
+  %r1 = rocdl.udot2 %v2i16, %v2i16, %i32 : (vector<2xi16>, vector<2xi16>, i32) -> i32
+  // CHECK: call i32 @llvm.amdgcn.udot2(<2 x i16> %{{.*}}, <2 x i16> %{{.*}}, i32 %{{.*}}, i1 true)
+  %r1c = rocdl.udot2 %v2i16, %v2i16, %i32 {clamp = true} : (vector<2xi16>, vector<2xi16>, i32) -> i32
+
+  // CHECK: call i32 @llvm.amdgcn.sdot4(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 false)
+  %r2 = rocdl.sdot4 %i32, %i32, %i32 : (i32, i32, i32) -> i32
+  // CHECK: call i32 @llvm.amdgcn.sdot4(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 true)
+  %r2c = rocdl.sdot4 %i32, %i32, %i32 {clamp = true} : (i32, i32, i32) -> i32
+
+  // CHECK: call i32 @llvm.amdgcn.udot4(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 false)
+  %r3 = rocdl.udot4 %i32, %i32, %i32 : (i32, i32, i32) -> i32
+  // CHECK: call i32 @llvm.amdgcn.udot4(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 true)
+  %r3c = rocdl.udot4 %i32, %i32, %i32 {clamp = true} : (i32, i32, i32) -> i32
+
+  // CHECK: call i32 @llvm.amdgcn.sdot8(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 false)
+  %r4 = rocdl.sdot8 %i32, %i32, %i32 : (i32, i32, i32) -> i32
+  // CHECK: call i32 @llvm.amdgcn.sdot8(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 true)
+  %r4c = rocdl.sdot8 %i32, %i32, %i32 {clamp = true} : (i32, i32, i32) -> i32
+
+  // CHECK: call i32 @llvm.amdgcn.udot8(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 false)
+  %r5 = rocdl.udot8 %i32, %i32, %i32 : (i32, i32, i32) -> i32
+  // CHECK: call i32 @llvm.amdgcn.udot8(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 true)
+  %r5c = rocdl.udot8 %i32, %i32, %i32 {clamp = true} : (i32, i32, i32) -> i32
+
+  llvm.return %r0 : i32
+}
+
+// CHECK-LABEL: @rocdl_dot_sudot_family
+llvm.func @rocdl_dot_sudot_family(%a: i32, %b: i32, %c: i32) -> i32 {
+  // CHECK: call i32 @llvm.amdgcn.sudot4(i1 false, i32 %{{.*}}, i1 false, i32 %{{.*}}, i32 %{{.*}}, i1 false)
+  %r0 = rocdl.sudot4 %a, %b, %c : (i32, i32, i32) -> i32
+  // CHECK: call i32 @llvm.amdgcn.sudot4(i1 true, i32 %{{.*}}, i1 false, i32 %{{.*}}, i32 %{{.*}}, i1 false)
+  %r0a = rocdl.sudot4 %a, %b, %c {signA = true, signB = false, clamp = false} : (i32, i32, i32) -> i32
+  // CHECK: call i32 @llvm.amdgcn.sudot4(i1 false, i32 %{{.*}}, i1 true, i32 %{{.*}}, i32 %{{.*}}, i1 false)
+  %r0b = rocdl.sudot4 %a, %b, %c {signA = false, signB = true, clamp = false} : (i32, i32, i32) -> i32
+  // CHECK: call i32 @llvm.amdgcn.sudot4(i1 true, i32 %{{.*}}, i1 true, i32 %{{.*}}, i32 %{{.*}}, i1 true)
+  %r0c = rocdl.sudot4 %a, %b, %c {signA = true, signB = true, clamp = true} : (i32, i32, i32) -> i32
+
+  // CHECK: call i32 @llvm.amdgcn.sudot8(i1 false, i32 %{{.*}}, i1 false, i32 %{{.*}}, i32 %{{.*}}, i1 false)
+  %r1 = rocdl.sudot8 %a, %b, %c : (i32, i32, i32) -> i32
+  // CHECK: call i32 @llvm.amdgcn.sudot8(i1 true, i32 %{{.*}}, i1 false, i32 %{{.*}}, i32 %{{.*}}, i1 false)
+  %r1a = rocdl.sudot8 %a, %b, %c {signA = true, signB = false, clamp = false} : (i32, i32, i32) -> i32
+  // CHECK: call i32 @llvm.amdgcn.sudot8(i1 false, i32 %{{.*}}, i1 true, i32 %{{.*}}, i32 %{{.*}}, i1 false)
+  %r1b = rocdl.sudot8 %a, %b, %c {signA = false, signB = true, clamp = false} : (i32, i32, i32) -> i32
+  // CHECK: call i32 @llvm.amdgcn.sudot8(i1 true, i32 %{{.*}}, i1 true, i32 %{{.*}}, i32 %{{.*}}, i1 true)
+  %r1c = rocdl.sudot8 %a, %b, %c {signA = true, signB = true, clamp = true} : (i32, i32, i32) -> i32
+
+  llvm.return %r0 : i32
+}
+
+// CHECK-LABEL: @rocdl_dot_fp8_family
+llvm.func @rocdl_dot_fp8_family(%i32: i32, %f32: f32) -> f32 {
+  // CHECK: call float @llvm.amdgcn.dot4.f32.fp8.fp8(i32 %{{.*}}, i32 %{{.*}}, float %{{.*}})
+  %r0 = rocdl.dot4.f32.fp8.fp8 %i32, %i32, %f32 : (i32, i32, f32) -> f32
+  // CHECK: call float @llvm.amdgcn.dot4.f32.fp8.bf8(i32 %{{.*}}, i32 %{{.*}}, float %{{.*}})
+  %r1 = rocdl.dot4.f32.fp8.bf8 %i32, %i32, %f32 : (i32, i32, f32) -> f32
+  // CHECK: call float @llvm.amdgcn.dot4.f32.bf8.fp8(i32 %{{.*}}, i32 %{{.*}}, float %{{.*}})
+  %r2 = rocdl.dot4.f32.bf8.fp8 %i32, %i32, %f32 : (i32, i32, f32) -> f32
+  // CHECK: call float @llvm.amdgcn.dot4.f32.bf8.bf8(i32 %{{.*}}, i32 %{{.*}}, float %{{.*}})
+  %r3 = rocdl.dot4.f32.bf8.bf8 %i32, %i32, %f32 : (i32, i32, f32) -> f32
+
+  llvm.return %r0 : f32
+}
+
 // CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" "uniform-work-group-size" }
 // CHECK-DAG: attributes #[[$KERNEL_WORKGROUP_ATTRS]] = { "amdgpu-flat-work-group-size"="1,1024"
 // CHECK-DAG: attributes #[[$KNOWN_BLOCK_SIZE_ATTRS]] = { "amdgpu-flat-work-group-size"="128,128"