[Mlir-commits] [mlir] 0d45876 - [ROCDL] Add dot intrinsics to rocdl (#193129)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Tue Apr 21 10:48:30 PDT 2026
Author: Eric Feng
Date: 2026-04-21T17:48:26Z
New Revision: 0d45876e43a8189f1076ecab440f6106d1c97bd7
URL: https://github.com/llvm/llvm-project/commit/0d45876e43a8189f1076ecab440f6106d1c97bd7
DIFF: https://github.com/llvm/llvm-project/commit/0d45876e43a8189f1076ecab440f6106d1c97bd7.diff
LOG: [ROCDL] Add dot intrinsics to rocdl (#193129)
This patch adds dot-product intrinsic support to the rocdl dialect. Having
these (including a follow-up `amdgpu` wrapper) as first-class citizens in MLIR
will allow us to lower thread-local reductions involving `<=16bit` data more
effectively. This is in line with the spirit of `dot` intrinsic support
in the existing edge dialects (`x86`, `nvvm`, `spirv`).
Assisted by: Claude
---------
Signed-off-by: Eric Feng <Eric.Feng at amd.com>
Added:
Modified:
mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
mlir/test/Dialect/LLVMIR/rocdl.mlir
mlir/test/Target/LLVMIR/rocdl.mlir
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index b13206ce5c342..c887598626d17 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -1425,6 +1425,125 @@ def ROCDL_wmma_scale_f32_32x16x128_f4 : ROCDL_WMMA_Scale_F4_IntrOp<"wmma.s
def ROCDL_wmma_scale16_f32_32x16x128_f4 : ROCDL_WMMA_Scale_F4_IntrOp<"wmma.scale16.f32.32x16x128.f4", AnyInteger, F32, I64>;
+//===---------------------------------------------------------------------===//
+// Dot product intrinsics (v_dot*)
+class ROCDL_Dot_IntrOp<string mnemonic, ROCDL_NamedType A, ROCDL_NamedType B,
+ ROCDL_NamedType C> :
+ ROCDL_ConcreteNonMemIntrOp<mnemonic, [Pure], 1, [3], ["clamp"]>,
+ Arguments<(ins A:$a, B:$b, C:$c,
+ DefaultValuedAttr<I1Attr, "0">:$clamp)> {
+ let results = (outs C:$res);
+ let assemblyFormat = [{
+ $a `,` $b `,` $c attr-dict `:` functional-type(operands, $res)
+ }];
+ let description = [{
+ Packed intra-lane dot-product with optional result clamping (`clamp`).
+ Computes `res = sum_i a[i]*b[i] + c`, where `a` and `b` hold packed
+ 16/8/4-bit data (for `dot2`, `dot4`, `dot8` respectively).
+
+ Example:
+ ```mlir
+ %r = rocdl.}] # mnemonic # [{ %a, %b, %c {clamp = true} :
+ (}] # A.typeName # [{, }] # B.typeName # [{, }] # C.typeName # [{) -> }]
+ # C.typeName # [{
+ ```
+ }];
+}
+
+class ROCDL_Dot_NoClamp_IntrOp<string mnemonic, ROCDL_NamedType A,
+ ROCDL_NamedType B, ROCDL_NamedType C> :
+ ROCDL_ConcreteNonMemIntrOp<mnemonic, [Pure], 1, [], []>,
+ Arguments<(ins A:$a, B:$b, C:$c)> {
+ let results = (outs C:$res);
+ let assemblyFormat = [{
+ $a `,` $b `,` $c attr-dict `:` functional-type(operands, $res)
+ }];
+ let description = [{
+ Packed intra-lane dot-product with no clamp control.
+ Computes `res = sum_i a[i]*b[i] + c`. Covers the full-f16/bf16
+ accumulator forms (`fdot2.f16.f16`, `fdot2.bf16.bf16`) and the
+ FP8/BF8 `dot4.f32.*` variants, whose hardware instructions have no
+ CLAMP bit in their modifier word.
+
+ Example:
+ ```mlir
+ %r = rocdl.}] # mnemonic # [{ %a, %b, %c : (}] # A.typeName # [{, }]
+ # B.typeName # [{, }] # C.typeName # [{) -> }] # C.typeName # [{
+ ```
+ }];
+}
+
+class ROCDL_Sudot_IntrOp<string mnemonic> :
+ ROCDL_ConcreteNonMemIntrOp<mnemonic, [Pure], 1, [0, 2, 5],
+ ["signA", "signB", "clamp"]>,
+ Arguments<(ins DefaultValuedAttr<I1Attr, "0">:$signA,
+ I32:$a,
+ DefaultValuedAttr<I1Attr, "0">:$signB,
+ I32:$b,
+ I32:$c,
+ DefaultValuedAttr<I1Attr, "0">:$clamp)> {
+ let results = (outs I32:$res);
+ let assemblyFormat = [{
+ $a `,` $b `,` $c attr-dict `:` functional-type(operands, $res)
+ }];
+ let description = [{
+ Mixed-signedness packed dot-product with per-operand sign controls.
+ Computes `res = sum_i a[i]*b[i] + c`. Each packed element of `a` is
+ treated as signed when `signA = true`; when `signA = false`, the
+ unsigned element is zero-extended into a wider signed integer. `signB`
+ controls the same for `b`. `clamp` controls result clamping.
+
+ These ops correspond to RDNA's unified mixed-sign `v_dot4_i32_iu8`
+ and `v_dot8_i32_iu4` instructions (gfx11+).
+
+ Example:
+ ```mlir
+ %r = rocdl.}] # mnemonic # [{ %a, %b, %c
+ {signA = true, signB = false, clamp = true} :
+ (i32, i32, i32) -> i32
+ ```
+ }];
+}
+
+// Available from gfx906.
+def ROCDL_fdot2 : ROCDL_Dot_IntrOp<"fdot2",
+ ROCDL_V2F16Type, ROCDL_V2F16Type, ROCDL_Scalar<F32>>;
+def ROCDL_sdot2 : ROCDL_Dot_IntrOp<"sdot2",
+ ROCDL_V2I16Type, ROCDL_V2I16Type, ROCDL_Scalar<I32>>;
+def ROCDL_udot2 : ROCDL_Dot_IntrOp<"udot2",
+ ROCDL_V2I16Type, ROCDL_V2I16Type, ROCDL_Scalar<I32>>;
+def ROCDL_sdot4 : ROCDL_Dot_IntrOp<"sdot4",
+ ROCDL_Scalar<I32>, ROCDL_Scalar<I32>, ROCDL_Scalar<I32>>;
+def ROCDL_udot4 : ROCDL_Dot_IntrOp<"udot4",
+ ROCDL_Scalar<I32>, ROCDL_Scalar<I32>, ROCDL_Scalar<I32>>;
+def ROCDL_sdot8 : ROCDL_Dot_IntrOp<"sdot8",
+ ROCDL_Scalar<I32>, ROCDL_Scalar<I32>, ROCDL_Scalar<I32>>;
+def ROCDL_udot8 : ROCDL_Dot_IntrOp<"udot8",
+ ROCDL_Scalar<I32>, ROCDL_Scalar<I32>, ROCDL_Scalar<I32>>;
+
+// Available from gfx11.
+def ROCDL_fdot2_f16_f16 : ROCDL_Dot_NoClamp_IntrOp<"fdot2.f16.f16",
+ ROCDL_V2F16Type, ROCDL_V2F16Type, ROCDL_Scalar<F16>>;
+def ROCDL_fdot2_bf16_bf16 : ROCDL_Dot_NoClamp_IntrOp<"fdot2.bf16.bf16",
+ ROCDL_V2BF16Type, ROCDL_V2BF16Type, ROCDL_Scalar<BF16>>;
+def ROCDL_sudot4 : ROCDL_Sudot_IntrOp<"sudot4">;
+def ROCDL_sudot8 : ROCDL_Sudot_IntrOp<"sudot8">;
+
+// Available from gfx11 and gfx950.
+def ROCDL_fdot2_f32_bf16 : ROCDL_Dot_IntrOp<"fdot2.f32.bf16",
+ ROCDL_V2BF16Type, ROCDL_V2BF16Type, ROCDL_Scalar<F32>>;
+
+// Available from gfx12.
+def ROCDL_dot4_f32_fp8_fp8 : ROCDL_Dot_NoClamp_IntrOp<"dot4.f32.fp8.fp8",
+ ROCDL_Scalar<I32>, ROCDL_Scalar<I32>, ROCDL_Scalar<F32>>;
+def ROCDL_dot4_f32_fp8_bf8 : ROCDL_Dot_NoClamp_IntrOp<"dot4.f32.fp8.bf8",
+ ROCDL_Scalar<I32>, ROCDL_Scalar<I32>, ROCDL_Scalar<F32>>;
+def ROCDL_dot4_f32_bf8_fp8 : ROCDL_Dot_NoClamp_IntrOp<"dot4.f32.bf8.fp8",
+ ROCDL_Scalar<I32>, ROCDL_Scalar<I32>, ROCDL_Scalar<F32>>;
+def ROCDL_dot4_f32_bf8_bf8 : ROCDL_Dot_NoClamp_IntrOp<"dot4.f32.bf8.bf8",
+ ROCDL_Scalar<I32>, ROCDL_Scalar<I32>, ROCDL_Scalar<F32>>;
+
+
//===---------------------------------------------------------------------===//
// SWMMAC intrinsics
class ROCDL_SWMMAC_V0_IntrOp<string mnemonic, Type AB, Type CD> : ROCDL_IntrOp<mnemonic,
diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir
index 1d835b352e519..1da7b63efdd7e 100644
--- a/mlir/test/Dialect/LLVMIR/rocdl.mlir
+++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir
@@ -1627,6 +1627,107 @@ llvm.func @rocdl.swmmac(%v32f16 : vector<32xf16>, %v32bf16 : vector<32xbf16>,
llvm.return %w32_0 : vector<8xf32>
}
+// -----
+
+// CHECK-LABEL: @rocdl_dot_fdot2_family
+llvm.func @rocdl_dot_fdot2_family(%v2f16: vector<2xf16>, %v2bf16: vector<2xbf16>,
+ %f16: f16, %bf16: bf16, %f32: f32) -> f32 {
+ // CHECK: rocdl.fdot2 %{{.*}}, %{{.*}}, %{{.*}} : (vector<2xf16>, vector<2xf16>, f32) -> f32
+ %r0 = rocdl.fdot2 %v2f16, %v2f16, %f32 : (vector<2xf16>, vector<2xf16>, f32) -> f32
+ // CHECK: rocdl.fdot2 %{{.*}}, %{{.*}}, %{{.*}} {clamp = true} : (vector<2xf16>, vector<2xf16>, f32) -> f32
+ %r0c = rocdl.fdot2 %v2f16, %v2f16, %f32 {clamp = true} : (vector<2xf16>, vector<2xf16>, f32) -> f32
+
+ // CHECK: rocdl.fdot2.f32.bf16 %{{.*}}, %{{.*}}, %{{.*}} : (vector<2xbf16>, vector<2xbf16>, f32) -> f32
+ %r1 = rocdl.fdot2.f32.bf16 %v2bf16, %v2bf16, %f32 : (vector<2xbf16>, vector<2xbf16>, f32) -> f32
+ // CHECK: rocdl.fdot2.f32.bf16 %{{.*}}, %{{.*}}, %{{.*}} {clamp = true} : (vector<2xbf16>, vector<2xbf16>, f32) -> f32
+ %r1c = rocdl.fdot2.f32.bf16 %v2bf16, %v2bf16, %f32 {clamp = true} : (vector<2xbf16>, vector<2xbf16>, f32) -> f32
+
+ // CHECK: rocdl.fdot2.f16.f16 %{{.*}}, %{{.*}}, %{{.*}} : (vector<2xf16>, vector<2xf16>, f16) -> f16
+ %r3 = rocdl.fdot2.f16.f16 %v2f16, %v2f16, %f16 : (vector<2xf16>, vector<2xf16>, f16) -> f16
+
+ // CHECK: rocdl.fdot2.bf16.bf16 %{{.*}}, %{{.*}}, %{{.*}} : (vector<2xbf16>, vector<2xbf16>, bf16) -> bf16
+ %r4 = rocdl.fdot2.bf16.bf16 %v2bf16, %v2bf16, %bf16 : (vector<2xbf16>, vector<2xbf16>, bf16) -> bf16
+
+ llvm.return %r0 : f32
+}
+
+// -----
+
+// CHECK-LABEL: @rocdl_dot_sdot_udot_family
+llvm.func @rocdl_dot_sdot_udot_family(%v2i16: vector<2xi16>, %i32: i32) -> i32 {
+ // CHECK: rocdl.sdot2 %{{.*}}, %{{.*}}, %{{.*}} : (vector<2xi16>, vector<2xi16>, i32) -> i32
+ %r0 = rocdl.sdot2 %v2i16, %v2i16, %i32 : (vector<2xi16>, vector<2xi16>, i32) -> i32
+ // CHECK: rocdl.sdot2 %{{.*}}, %{{.*}}, %{{.*}} {clamp = true} : (vector<2xi16>, vector<2xi16>, i32) -> i32
+ %r0c = rocdl.sdot2 %v2i16, %v2i16, %i32 {clamp = true} : (vector<2xi16>, vector<2xi16>, i32) -> i32
+
+ // CHECK: rocdl.udot2 %{{.*}}, %{{.*}}, %{{.*}} : (vector<2xi16>, vector<2xi16>, i32) -> i32
+ %r1 = rocdl.udot2 %v2i16, %v2i16, %i32 : (vector<2xi16>, vector<2xi16>, i32) -> i32
+ // CHECK: rocdl.udot2 %{{.*}}, %{{.*}}, %{{.*}} {clamp = true} : (vector<2xi16>, vector<2xi16>, i32) -> i32
+ %r1c = rocdl.udot2 %v2i16, %v2i16, %i32 {clamp = true} : (vector<2xi16>, vector<2xi16>, i32) -> i32
+
+ // CHECK: rocdl.sdot4 %{{.*}}, %{{.*}}, %{{.*}} : (i32, i32, i32) -> i32
+ %r2 = rocdl.sdot4 %i32, %i32, %i32 : (i32, i32, i32) -> i32
+ // CHECK: rocdl.sdot4 %{{.*}}, %{{.*}}, %{{.*}} {clamp = true} : (i32, i32, i32) -> i32
+ %r2c = rocdl.sdot4 %i32, %i32, %i32 {clamp = true} : (i32, i32, i32) -> i32
+
+ // CHECK: rocdl.udot4 %{{.*}}, %{{.*}}, %{{.*}} : (i32, i32, i32) -> i32
+ %r3 = rocdl.udot4 %i32, %i32, %i32 : (i32, i32, i32) -> i32
+ // CHECK: rocdl.udot4 %{{.*}}, %{{.*}}, %{{.*}} {clamp = true} : (i32, i32, i32) -> i32
+ %r3c = rocdl.udot4 %i32, %i32, %i32 {clamp = true} : (i32, i32, i32) -> i32
+
+ // CHECK: rocdl.sdot8 %{{.*}}, %{{.*}}, %{{.*}} : (i32, i32, i32) -> i32
+ %r4 = rocdl.sdot8 %i32, %i32, %i32 : (i32, i32, i32) -> i32
+ // CHECK: rocdl.sdot8 %{{.*}}, %{{.*}}, %{{.*}} {clamp = true} : (i32, i32, i32) -> i32
+ %r4c = rocdl.sdot8 %i32, %i32, %i32 {clamp = true} : (i32, i32, i32) -> i32
+
+ // CHECK: rocdl.udot8 %{{.*}}, %{{.*}}, %{{.*}} : (i32, i32, i32) -> i32
+ %r5 = rocdl.udot8 %i32, %i32, %i32 : (i32, i32, i32) -> i32
+ // CHECK: rocdl.udot8 %{{.*}}, %{{.*}}, %{{.*}} {clamp = true} : (i32, i32, i32) -> i32
+ %r5c = rocdl.udot8 %i32, %i32, %i32 {clamp = true} : (i32, i32, i32) -> i32
+
+ llvm.return %r0 : i32
+}
+
+// -----
+
+// CHECK-LABEL: @rocdl_dot_sudot_family
+llvm.func @rocdl_dot_sudot_family(%i32: i32) -> i32 {
+ // CHECK: rocdl.sudot4 %{{.*}}, %{{.*}}, %{{.*}} : (i32, i32, i32) -> i32
+ %r0 = rocdl.sudot4 %i32, %i32, %i32 : (i32, i32, i32) -> i32
+ // CHECK: rocdl.sudot4 %{{.*}}, %{{.*}}, %{{.*}} {signA = true} : (i32, i32, i32) -> i32
+ %r0a = rocdl.sudot4 %i32, %i32, %i32 {signA = true, signB = false, clamp = false} : (i32, i32, i32) -> i32
+ // CHECK: rocdl.sudot4 %{{.*}}, %{{.*}}, %{{.*}} {signB = true} : (i32, i32, i32) -> i32
+ %r0b = rocdl.sudot4 %i32, %i32, %i32 {signA = false, signB = true, clamp = false} : (i32, i32, i32) -> i32
+ // CHECK: rocdl.sudot4 %{{.*}}, %{{.*}}, %{{.*}} {clamp = true, signA = true, signB = true} : (i32, i32, i32) -> i32
+ %r0c = rocdl.sudot4 %i32, %i32, %i32 {signA = true, signB = true, clamp = true} : (i32, i32, i32) -> i32
+
+ // CHECK: rocdl.sudot8 %{{.*}}, %{{.*}}, %{{.*}} : (i32, i32, i32) -> i32
+ %r1 = rocdl.sudot8 %i32, %i32, %i32 : (i32, i32, i32) -> i32
+ // CHECK: rocdl.sudot8 %{{.*}}, %{{.*}}, %{{.*}} {signA = true} : (i32, i32, i32) -> i32
+ %r1a = rocdl.sudot8 %i32, %i32, %i32 {signA = true, signB = false, clamp = false} : (i32, i32, i32) -> i32
+ // CHECK: rocdl.sudot8 %{{.*}}, %{{.*}}, %{{.*}} {signB = true} : (i32, i32, i32) -> i32
+ %r1b = rocdl.sudot8 %i32, %i32, %i32 {signA = false, signB = true, clamp = false} : (i32, i32, i32) -> i32
+ // CHECK: rocdl.sudot8 %{{.*}}, %{{.*}}, %{{.*}} {clamp = true, signA = true, signB = true} : (i32, i32, i32) -> i32
+ %r1c = rocdl.sudot8 %i32, %i32, %i32 {signA = true, signB = true, clamp = true} : (i32, i32, i32) -> i32
+
+ llvm.return %r0 : i32
+}
+
+// -----
+
+// CHECK-LABEL: @rocdl_dot_fp8_family
+llvm.func @rocdl_dot_fp8_family(%i32: i32, %f32: f32) -> f32 {
+ // CHECK: rocdl.dot4.f32.fp8.fp8 %{{.*}}, %{{.*}}, %{{.*}} : (i32, i32, f32) -> f32
+ %r0 = rocdl.dot4.f32.fp8.fp8 %i32, %i32, %f32 : (i32, i32, f32) -> f32
+ // CHECK: rocdl.dot4.f32.fp8.bf8 %{{.*}}, %{{.*}}, %{{.*}} : (i32, i32, f32) -> f32
+ %r1 = rocdl.dot4.f32.fp8.bf8 %i32, %i32, %f32 : (i32, i32, f32) -> f32
+ // CHECK: rocdl.dot4.f32.bf8.fp8 %{{.*}}, %{{.*}}, %{{.*}} : (i32, i32, f32) -> f32
+ %r2 = rocdl.dot4.f32.bf8.fp8 %i32, %i32, %f32 : (i32, i32, f32) -> f32
+ // CHECK: rocdl.dot4.f32.bf8.bf8 %{{.*}}, %{{.*}}, %{{.*}} : (i32, i32, f32) -> f32
+ %r3 = rocdl.dot4.f32.bf8.bf8 %i32, %i32, %f32 : (i32, i32, f32) -> f32
+
+ llvm.return %r0 : f32
+}
// -----
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index 483d8896383c3..169702efd312f 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -2098,6 +2098,100 @@ llvm.func @rocdl.cvt.scalef32.sr.pk16(%v16xf32: vector<16xf32>,
llvm.return
}
+// CHECK-LABEL: @rocdl_dot_fdot2_family
+llvm.func @rocdl_dot_fdot2_family(%v2f16: vector<2xf16>, %v2bf16: vector<2xbf16>,
+ %f16: f16, %bf16: bf16, %f32: f32) -> f32 {
+ // CHECK: call float @llvm.amdgcn.fdot2(<2 x half> %{{.*}}, <2 x half> %{{.*}}, float %{{.*}}, i1 false)
+ %r0 = rocdl.fdot2 %v2f16, %v2f16, %f32 : (vector<2xf16>, vector<2xf16>, f32) -> f32
+ // CHECK: call float @llvm.amdgcn.fdot2(<2 x half> %{{.*}}, <2 x half> %{{.*}}, float %{{.*}}, i1 true)
+ %r0c = rocdl.fdot2 %v2f16, %v2f16, %f32 {clamp = true} : (vector<2xf16>, vector<2xf16>, f32) -> f32
+
+ // CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %{{.*}}, <2 x bfloat> %{{.*}}, float %{{.*}}, i1 false)
+ %r1 = rocdl.fdot2.f32.bf16 %v2bf16, %v2bf16, %f32 : (vector<2xbf16>, vector<2xbf16>, f32) -> f32
+ // CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %{{.*}}, <2 x bfloat> %{{.*}}, float %{{.*}}, i1 true)
+ %r1c = rocdl.fdot2.f32.bf16 %v2bf16, %v2bf16, %f32 {clamp = true} : (vector<2xbf16>, vector<2xbf16>, f32) -> f32
+
+ // CHECK: call half @llvm.amdgcn.fdot2.f16.f16(<2 x half> %{{.*}}, <2 x half> %{{.*}}, half %{{.*}})
+ %r3 = rocdl.fdot2.f16.f16 %v2f16, %v2f16, %f16 : (vector<2xf16>, vector<2xf16>, f16) -> f16
+
+ // CHECK: call bfloat @llvm.amdgcn.fdot2.bf16.bf16(<2 x bfloat> %{{.*}}, <2 x bfloat> %{{.*}}, bfloat %{{.*}})
+ %r4 = rocdl.fdot2.bf16.bf16 %v2bf16, %v2bf16, %bf16 : (vector<2xbf16>, vector<2xbf16>, bf16) -> bf16
+
+ llvm.return %r0 : f32
+}
+
+// CHECK-LABEL: @rocdl_dot_sdot_udot_family
+llvm.func @rocdl_dot_sdot_udot_family(%v2i16: vector<2xi16>, %i32: i32) -> i32 {
+ // CHECK: call i32 @llvm.amdgcn.sdot2(<2 x i16> %{{.*}}, <2 x i16> %{{.*}}, i32 %{{.*}}, i1 false)
+ %r0 = rocdl.sdot2 %v2i16, %v2i16, %i32 : (vector<2xi16>, vector<2xi16>, i32) -> i32
+ // CHECK: call i32 @llvm.amdgcn.sdot2(<2 x i16> %{{.*}}, <2 x i16> %{{.*}}, i32 %{{.*}}, i1 true)
+ %r0c = rocdl.sdot2 %v2i16, %v2i16, %i32 {clamp = true} : (vector<2xi16>, vector<2xi16>, i32) -> i32
+
+ // CHECK: call i32 @llvm.amdgcn.udot2(<2 x i16> %{{.*}}, <2 x i16> %{{.*}}, i32 %{{.*}}, i1 false)
+ %r1 = rocdl.udot2 %v2i16, %v2i16, %i32 : (vector<2xi16>, vector<2xi16>, i32) -> i32
+ // CHECK: call i32 @llvm.amdgcn.udot2(<2 x i16> %{{.*}}, <2 x i16> %{{.*}}, i32 %{{.*}}, i1 true)
+ %r1c = rocdl.udot2 %v2i16, %v2i16, %i32 {clamp = true} : (vector<2xi16>, vector<2xi16>, i32) -> i32
+
+ // CHECK: call i32 @llvm.amdgcn.sdot4(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 false)
+ %r2 = rocdl.sdot4 %i32, %i32, %i32 : (i32, i32, i32) -> i32
+ // CHECK: call i32 @llvm.amdgcn.sdot4(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 true)
+ %r2c = rocdl.sdot4 %i32, %i32, %i32 {clamp = true} : (i32, i32, i32) -> i32
+
+ // CHECK: call i32 @llvm.amdgcn.udot4(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 false)
+ %r3 = rocdl.udot4 %i32, %i32, %i32 : (i32, i32, i32) -> i32
+ // CHECK: call i32 @llvm.amdgcn.udot4(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 true)
+ %r3c = rocdl.udot4 %i32, %i32, %i32 {clamp = true} : (i32, i32, i32) -> i32
+
+ // CHECK: call i32 @llvm.amdgcn.sdot8(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 false)
+ %r4 = rocdl.sdot8 %i32, %i32, %i32 : (i32, i32, i32) -> i32
+ // CHECK: call i32 @llvm.amdgcn.sdot8(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 true)
+ %r4c = rocdl.sdot8 %i32, %i32, %i32 {clamp = true} : (i32, i32, i32) -> i32
+
+ // CHECK: call i32 @llvm.amdgcn.udot8(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 false)
+ %r5 = rocdl.udot8 %i32, %i32, %i32 : (i32, i32, i32) -> i32
+ // CHECK: call i32 @llvm.amdgcn.udot8(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 true)
+ %r5c = rocdl.udot8 %i32, %i32, %i32 {clamp = true} : (i32, i32, i32) -> i32
+
+ llvm.return %r0 : i32
+}
+
+// CHECK-LABEL: @rocdl_dot_sudot_family
+llvm.func @rocdl_dot_sudot_family(%a: i32, %b: i32, %c: i32) -> i32 {
+ // CHECK: call i32 @llvm.amdgcn.sudot4(i1 false, i32 %{{.*}}, i1 false, i32 %{{.*}}, i32 %{{.*}}, i1 false)
+ %r0 = rocdl.sudot4 %a, %b, %c : (i32, i32, i32) -> i32
+ // CHECK: call i32 @llvm.amdgcn.sudot4(i1 true, i32 %{{.*}}, i1 false, i32 %{{.*}}, i32 %{{.*}}, i1 false)
+ %r0a = rocdl.sudot4 %a, %b, %c {signA = true, signB = false, clamp = false} : (i32, i32, i32) -> i32
+ // CHECK: call i32 @llvm.amdgcn.sudot4(i1 false, i32 %{{.*}}, i1 true, i32 %{{.*}}, i32 %{{.*}}, i1 false)
+ %r0b = rocdl.sudot4 %a, %b, %c {signA = false, signB = true, clamp = false} : (i32, i32, i32) -> i32
+ // CHECK: call i32 @llvm.amdgcn.sudot4(i1 true, i32 %{{.*}}, i1 true, i32 %{{.*}}, i32 %{{.*}}, i1 true)
+ %r0c = rocdl.sudot4 %a, %b, %c {signA = true, signB = true, clamp = true} : (i32, i32, i32) -> i32
+
+ // CHECK: call i32 @llvm.amdgcn.sudot8(i1 false, i32 %{{.*}}, i1 false, i32 %{{.*}}, i32 %{{.*}}, i1 false)
+ %r1 = rocdl.sudot8 %a, %b, %c : (i32, i32, i32) -> i32
+ // CHECK: call i32 @llvm.amdgcn.sudot8(i1 true, i32 %{{.*}}, i1 false, i32 %{{.*}}, i32 %{{.*}}, i1 false)
+ %r1a = rocdl.sudot8 %a, %b, %c {signA = true, signB = false, clamp = false} : (i32, i32, i32) -> i32
+ // CHECK: call i32 @llvm.amdgcn.sudot8(i1 false, i32 %{{.*}}, i1 true, i32 %{{.*}}, i32 %{{.*}}, i1 false)
+ %r1b = rocdl.sudot8 %a, %b, %c {signA = false, signB = true, clamp = false} : (i32, i32, i32) -> i32
+ // CHECK: call i32 @llvm.amdgcn.sudot8(i1 true, i32 %{{.*}}, i1 true, i32 %{{.*}}, i32 %{{.*}}, i1 true)
+ %r1c = rocdl.sudot8 %a, %b, %c {signA = true, signB = true, clamp = true} : (i32, i32, i32) -> i32
+
+ llvm.return %r0 : i32
+}
+
+// CHECK-LABEL: @rocdl_dot_fp8_family
+llvm.func @rocdl_dot_fp8_family(%i32: i32, %f32: f32) -> f32 {
+ // CHECK: call float @llvm.amdgcn.dot4.f32.fp8.fp8(i32 %{{.*}}, i32 %{{.*}}, float %{{.*}})
+ %r0 = rocdl.dot4.f32.fp8.fp8 %i32, %i32, %f32 : (i32, i32, f32) -> f32
+ // CHECK: call float @llvm.amdgcn.dot4.f32.fp8.bf8(i32 %{{.*}}, i32 %{{.*}}, float %{{.*}})
+ %r1 = rocdl.dot4.f32.fp8.bf8 %i32, %i32, %f32 : (i32, i32, f32) -> f32
+ // CHECK: call float @llvm.amdgcn.dot4.f32.bf8.fp8(i32 %{{.*}}, i32 %{{.*}}, float %{{.*}})
+ %r2 = rocdl.dot4.f32.bf8.fp8 %i32, %i32, %f32 : (i32, i32, f32) -> f32
+ // CHECK: call float @llvm.amdgcn.dot4.f32.bf8.bf8(i32 %{{.*}}, i32 %{{.*}}, float %{{.*}})
+ %r3 = rocdl.dot4.f32.bf8.bf8 %i32, %i32, %f32 : (i32, i32, f32) -> f32
+
+ llvm.return %r0 : f32
+}
+
// CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" "uniform-work-group-size" }
// CHECK-DAG: attributes #[[$KERNEL_WORKGROUP_ATTRS]] = { "amdgpu-flat-work-group-size"="1,1024"
// CHECK-DAG: attributes #[[$KNOWN_BLOCK_SIZE_ATTRS]] = { "amdgpu-flat-work-group-size"="128,128"
More information about the Mlir-commits
mailing list