[Mlir-commits] [mlir] [mlir][rocdl] Add cvt scale intrinsics (PR #159649)
Erick Ochoa Lopez
llvmlistbot at llvm.org
Thu Sep 18 14:10:59 PDT 2025
https://github.com/amd-eochoalo updated https://github.com/llvm/llvm-project/pull/159649
>From 804c425475d7ad3109aedee6cbe45df428d0b77a Mon Sep 17 00:00:00 2001
From: Erick Ochoa <erick.ochoalopez at amd.com>
Date: Thu, 18 Sep 2025 08:37:27 -0400
Subject: [PATCH 1/4] [mlir][rocdl] Scaled fp4,bf8,fp8 to bf16,f16,f32
conversion intrinsics
---
mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 32 ++++++++++++++++++++
mlir/test/Dialect/LLVMIR/rocdl.mlir | 29 ++++++++++++++++++
2 files changed, 61 insertions(+)
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 9fa3ec1fc4b21..9137f5214877c 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -835,10 +835,14 @@ class ROCDL_ConcreteVector<Type elem, int length> :
def ROCDL_V2I16Type : ROCDL_ConcreteVector<I16, 2>;
def ROCDL_V2F16Type : ROCDL_ConcreteVector<F16, 2>;
+def ROCDL_V2I32Type : ROCDL_ConcreteVector<I32, 2>;
def ROCDL_V2BF16Type : ROCDL_ConcreteVector<BF16, 2>;
def ROCDL_V2F32Type : ROCDL_ConcreteVector<F32, 2>;
def ROCDL_V6I32Type : ROCDL_ConcreteVector<I32, 6>;
def ROCDL_V8I32Type : ROCDL_ConcreteVector<I32, 8>;
+def ROCDL_V8BF16Type : ROCDL_ConcreteVector<BF16, 8>;
+def ROCDL_V8F16Type : ROCDL_ConcreteVector<F16, 8>;
+def ROCDL_V8F32Type : ROCDL_ConcreteVector<F32, 8>;
def ROCDL_V16F32Type : ROCDL_ConcreteVector<F32, 16>;
def ROCDL_V32F16Type : ROCDL_ConcreteVector<F16, 32>;
def ROCDL_V32BF16Type : ROCDL_ConcreteVector<BF16, 32>;
@@ -975,6 +979,34 @@ class ScaleArgInfo<TypeConstraint argTyVal, string typeName> {
string nameForOp = typeName;
}
+//===---------------------------------------------------------------------===//
+// Scaled {fp4,bf8,fp8} to {bf16,f16,f32} conversion intrinsics
+//===---------------------------------------------------------------------===//
+// Defines one ROCDL_CvtPkScalePk8<largeT><smallT>Op for every (smallT, largeT) pair listed below.
+foreach smallT = [
+ ScaleArgInfo<I32, "Fp4">,
+ ScaleArgInfo<ROCDL_V2I32Type, "Fp8">,
+ ScaleArgInfo<ROCDL_V2I32Type, "Bf8">
+] in {
+ foreach largeT = [
+ ScaleArgInfo<ROCDL_V8F16Type, "F16">,
+ ScaleArgInfo<ROCDL_V8BF16Type, "Bf16">,
+ ScaleArgInfo<ROCDL_V8F32Type, "F32">,
+ ] in {
+ def ROCDL_CvtPkScalePk8 # largeT.nameForOp # smallT.nameForOp # Op :
+ ROCDL_ConcreteNonMemIntrOp<"cvt.scale.pk8." # largeT.name # "." # smallT.name,
+ [Pure], 1, [2], ["scaleSel"]>,
+ Arguments<(ins smallT.type:$src, I32:$scale, I32Attr:$scaleSel)> {
+ // NOTE(review): "scaleSel" appears to name the immediate argument at position 2 - confirm against ROCDL_ConcreteNonMemIntrOp.
+ let summary = "Scales 8 " # smallT.name # " and converts them to 8 " # largeT.name # ".";
+ let results = (outs largeT.type:$res);
+ let assemblyFormat = [{
+ attr-dict $src `,` $scale `[` $scaleSel `]` `:` type($res)
+ }];
+ }
+ } // foreach largeT
+} // foreach smallTOp
+
//===---------------------------------------------------------------------===//
// Scaled 32x6-bit float float conversion intrinsics
//===---------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir
index 782ef4e154440..5b708204a6a7a 100644
--- a/mlir/test/Dialect/LLVMIR/rocdl.mlir
+++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir
@@ -1025,6 +1025,35 @@ llvm.func @rocdl.permlane32.swap(%src : i32) -> !llvm.struct<(i32, i32)> {
// -----
+// CHECK-LABEL: rocdl.cvt.scale.pk8
+llvm.func @rocdl.cvt.scale.pk8(%i32: i32, %v2xi32: vector<2xi32>, %scale: i32) {
+ // fp4 sources: a single i32 operand, one case per result type (f16, bf16, f32).
+ // CHECK: rocdl.cvt.scale.pk8.f16.fp4
+ %0 = rocdl.cvt.scale.pk8.f16.fp4 %i32, %scale[0] : vector<8xf16>
+ // CHECK: rocdl.cvt.scale.pk8.bf16.fp4
+ %1 = rocdl.cvt.scale.pk8.bf16.fp4 %i32, %scale[0] : vector<8xbf16>
+ // CHECK: rocdl.cvt.scale.pk8.f32.fp4
+ %2 = rocdl.cvt.scale.pk8.f32.fp4 %i32, %scale[0] : vector<8xf32>
+ // fp8 sources: a vector<2xi32> operand.
+ // CHECK: rocdl.cvt.scale.pk8.f16.fp8
+ %3 = rocdl.cvt.scale.pk8.f16.fp8 %v2xi32, %scale[0] : vector<8xf16>
+ // CHECK: rocdl.cvt.scale.pk8.bf16.fp8
+ %4 = rocdl.cvt.scale.pk8.bf16.fp8 %v2xi32, %scale[0] : vector<8xbf16>
+ // CHECK: rocdl.cvt.scale.pk8.f32.fp8
+ %5 = rocdl.cvt.scale.pk8.f32.fp8 %v2xi32, %scale[0] : vector<8xf32>
+ // bf8 sources: a vector<2xi32> operand.
+ // CHECK: rocdl.cvt.scale.pk8.f16.bf8
+ %6 = rocdl.cvt.scale.pk8.f16.bf8 %v2xi32, %scale[0] : vector<8xf16>
+ // CHECK: rocdl.cvt.scale.pk8.bf16.bf8
+ %7 = rocdl.cvt.scale.pk8.bf16.bf8 %v2xi32, %scale[0] : vector<8xbf16>
+ // CHECK: rocdl.cvt.scale.pk8.f32.bf8
+ %8 = rocdl.cvt.scale.pk8.f32.bf8 %v2xi32, %scale[0] : vector<8xf32>
+
+ llvm.return
+}
+
+// -----
+
// expected-error at below {{attribute attached to unexpected op}}
func.func private @expected_llvm_func() attributes { rocdl.kernel }
>From ea5510918a9dae2cc005e7a5e6a1613a434d7c40 Mon Sep 17 00:00:00 2001
From: Erick Ochoa <erick.ochoalopez at amd.com>
Date: Thu, 18 Sep 2025 16:09:00 -0400
Subject: [PATCH 2/4] [mlir][rocdl] Scaled bf6,fp6 to bf16,f16,f32 conversion
intrinsics.
---
mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 30 ++++++++++++++++++++
mlir/test/Dialect/LLVMIR/rocdl.mlir | 22 ++++++++++++++
2 files changed, 52 insertions(+)
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 9137f5214877c..1252d8589cc63 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -838,11 +838,14 @@ def ROCDL_V2F16Type : ROCDL_ConcreteVector<F16, 2>;
def ROCDL_V2I32Type : ROCDL_ConcreteVector<I32, 2>;
def ROCDL_V2BF16Type : ROCDL_ConcreteVector<BF16, 2>;
def ROCDL_V2F32Type : ROCDL_ConcreteVector<F32, 2>;
+def ROCDL_V3I32Type : ROCDL_ConcreteVector<I32, 3>;
def ROCDL_V6I32Type : ROCDL_ConcreteVector<I32, 6>;
def ROCDL_V8I32Type : ROCDL_ConcreteVector<I32, 8>;
def ROCDL_V8BF16Type : ROCDL_ConcreteVector<BF16, 8>;
def ROCDL_V8F16Type : ROCDL_ConcreteVector<F16, 8>;
def ROCDL_V8F32Type : ROCDL_ConcreteVector<F32, 8>;
+def ROCDL_V16BF16Type : ROCDL_ConcreteVector<BF16, 16>;
+def ROCDL_V16F16Type : ROCDL_ConcreteVector<F16, 16>;
def ROCDL_V16F32Type : ROCDL_ConcreteVector<F32, 16>;
def ROCDL_V32F16Type : ROCDL_ConcreteVector<F16, 32>;
def ROCDL_V32BF16Type : ROCDL_ConcreteVector<BF16, 32>;
@@ -1007,6 +1010,33 @@ foreach smallT = [
} // foreach largeT
} // foreach smallTOp
+//===---------------------------------------------------------------------===//
+// Scaled {bf6,fp6} to {bf16,f16,f32} conversion intrinsics
+//===---------------------------------------------------------------------===//
+// Defines one ROCDL_CvtPkScalePk16<largeT><smallT>Op for every (smallT, largeT) pair listed below.
+foreach smallT = [
+ ScaleArgInfo<ROCDL_V3I32Type, "Fp6">,
+ ScaleArgInfo<ROCDL_V3I32Type, "Bf6">
+] in {
+ foreach largeT = [
+ ScaleArgInfo<ROCDL_V16F16Type, "F16">,
+ ScaleArgInfo<ROCDL_V16BF16Type, "Bf16">,
+ ScaleArgInfo<ROCDL_V16F32Type, "F32">,
+ ] in {
+ def ROCDL_CvtPkScalePk16 # largeT.nameForOp # smallT.nameForOp # Op :
+ ROCDL_ConcreteNonMemIntrOp<"cvt.scale.pk16." # largeT.name # "." # smallT.name,
+ [Pure], 1, [2], ["scaleSel"]>,
+ Arguments<(ins smallT.type:$src, I32:$scale, I32Attr:$scaleSel)> {
+ // NOTE(review): "scaleSel" appears to name the immediate argument at position 2 - confirm against ROCDL_ConcreteNonMemIntrOp.
+ let summary = "Scales 16 " # smallT.name # " and converts them to 16 " # largeT.name # ".";
+ let results = (outs largeT.type:$res);
+ let assemblyFormat = [{
+ attr-dict $src `,` $scale `[` $scaleSel `]` `:` type($res)
+ }];
+ }
+ } // foreach largeT
+} // foreach smallTOp
+
//===---------------------------------------------------------------------===//
// Scaled 32x6-bit float float conversion intrinsics
//===---------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir
index 5b708204a6a7a..959bb35302b20 100644
--- a/mlir/test/Dialect/LLVMIR/rocdl.mlir
+++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir
@@ -1054,6 +1054,28 @@ llvm.func @rocdl.cvt.scale.pk8(%i32: i32, %v2xi32: vector<2xi32>, %scale: i32) {
// -----
+// CHECK-LABEL: rocdl.cvt.scale.pk16
+llvm.func @rocdl.cvt.scale.pk16(%v3xi32: vector<3xi32>, %scale:i32) {
+ // fp6 sources: a vector<3xi32> operand, one case per result type (f16, bf16, f32).
+ // CHECK: rocdl.cvt.scale.pk16.f16.fp6
+ %0 = rocdl.cvt.scale.pk16.f16.fp6 %v3xi32, %scale[0] : vector<16xf16>
+ // CHECK: rocdl.cvt.scale.pk16.bf16.fp6
+ %1 = rocdl.cvt.scale.pk16.bf16.fp6 %v3xi32, %scale[0] : vector<16xbf16>
+ // CHECK: rocdl.cvt.scale.pk16.f32.fp6
+ %2 = rocdl.cvt.scale.pk16.f32.fp6 %v3xi32, %scale[0] : vector<16xf32>
+ // bf6 sources: a vector<3xi32> operand.
+ // CHECK: rocdl.cvt.scale.pk16.f16.bf6
+ %3 = rocdl.cvt.scale.pk16.f16.bf6 %v3xi32, %scale[0] : vector<16xf16>
+ // CHECK: rocdl.cvt.scale.pk16.bf16.bf6
+ %4 = rocdl.cvt.scale.pk16.bf16.bf6 %v3xi32, %scale[0] : vector<16xbf16>
+ // CHECK: rocdl.cvt.scale.pk16.f32.bf6
+ %5 = rocdl.cvt.scale.pk16.f32.bf6 %v3xi32, %scale[0] : vector<16xf32>
+
+ llvm.return
+}
+
+// -----
+
// expected-error at below {{attribute attached to unexpected op}}
func.func private @expected_llvm_func() attributes { rocdl.kernel }
>From ceec3957865ce2fb132da1012deaccd4026e2835 Mon Sep 17 00:00:00 2001
From: Erick Ochoa <erick.ochoalopez at amd.com>
Date: Thu, 18 Sep 2025 16:59:06 -0400
Subject: [PATCH 3/4] [mlir] Add test for cvt.scale.pk16 lowerings
---
mlir/test/Target/LLVMIR/rocdl.mlir | 20 ++++++++++++++++++++
1 file changed, 20 insertions(+)
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index a464358250c38..1cc17b41239cb 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -1298,6 +1298,26 @@ llvm.func @rocdl_last_use(%ptr: !llvm.ptr<1>) -> i32 {
llvm.return %ret : i32
}
+// CHECK-LABEL: @rocdl.cvt.scale.pk16
+// CHECK-SAME:(<3 x i32> %[[SRC0:.+]], i32 %[[SCALE:.+]])
+llvm.func @rocdl.cvt.scale.pk16(%v3xi32: vector<3xi32>, %scale:i32) {
+ // Each op is expected to become a call to the same-named llvm.amdgcn intrinsic, with the bracketed scaleSel (0) as the trailing i32 constant.
+ // CHECK: call <16 x half> @llvm.amdgcn.cvt.scale.pk16.f16.fp6(<3 x i32> %[[SRC0]], i32 %[[SCALE]], i32 0)
+ %0 = rocdl.cvt.scale.pk16.f16.fp6 %v3xi32, %scale[0] : vector<16xf16>
+ // CHECK: call <16 x bfloat> @llvm.amdgcn.cvt.scale.pk16.bf16.fp6(<3 x i32> %[[SRC0]], i32 %[[SCALE]], i32 0)
+ %1 = rocdl.cvt.scale.pk16.bf16.fp6 %v3xi32, %scale[0] : vector<16xbf16>
+ // CHECK: call <16 x float> @llvm.amdgcn.cvt.scale.pk16.f32.fp6(<3 x i32> %[[SRC0]], i32 %[[SCALE]], i32 0)
+ %2 = rocdl.cvt.scale.pk16.f32.fp6 %v3xi32, %scale[0] : vector<16xf32>
+ // CHECK: call <16 x half> @llvm.amdgcn.cvt.scale.pk16.f16.bf6(<3 x i32> %[[SRC0]], i32 %[[SCALE]], i32 0)
+ %3 = rocdl.cvt.scale.pk16.f16.bf6 %v3xi32, %scale[0] : vector<16xf16>
+ // CHECK: call <16 x bfloat> @llvm.amdgcn.cvt.scale.pk16.bf16.bf6(<3 x i32> %[[SRC0]], i32 %[[SCALE]], i32 0)
+ %4 = rocdl.cvt.scale.pk16.bf16.bf6 %v3xi32, %scale[0] : vector<16xbf16>
+ // CHECK: call <16 x float> @llvm.amdgcn.cvt.scale.pk16.f32.bf6(<3 x i32> %[[SRC0]], i32 %[[SCALE]], i32 0)
+ %5 = rocdl.cvt.scale.pk16.f32.bf6 %v3xi32, %scale[0] : vector<16xf32>
+
+ llvm.return
+}
+
// CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" "uniform-work-group-size"="true" }
// CHECK-DAG: attributes #[[$KERNEL_WORKGROUP_ATTRS]] = { "amdgpu-flat-work-group-size"="1,1024"
// CHECK-DAG: attributes #[[$KNOWN_BLOCK_SIZE_ATTRS]] = { "amdgpu-flat-work-group-size"="128,128"
>From b1c72d4fb0fe6f0737956ed18e7aa8a861799606 Mon Sep 17 00:00:00 2001
From: Erick Ochoa <erick.ochoalopez at amd.com>
Date: Thu, 18 Sep 2025 17:09:49 -0400
Subject: [PATCH 4/4] [mlir] Add test for cvt.scale.pk8 lowerings
---
mlir/test/Target/LLVMIR/rocdl.mlir | 28 ++++++++++++++++++++++++++++
1 file changed, 28 insertions(+)
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index 1cc17b41239cb..bf18db99f6cf2 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -1298,6 +1298,34 @@ llvm.func @rocdl_last_use(%ptr: !llvm.ptr<1>) -> i32 {
llvm.return %ret : i32
}
+// CHECK-LABEL: rocdl.cvt.scale.pk8
+// CHECK-SAME:(i32 %[[I32:.+]], <2 x i32> %[[V2I32:.+]], i32 %[[SCALE:.+]])
+llvm.func @rocdl.cvt.scale.pk8(%i32: i32, %v2xi32: vector<2xi32>, %scale: i32) {
+ // Each op is expected to become a call to the same-named llvm.amdgcn intrinsic, with the bracketed scaleSel (0) as the trailing i32 constant.
+ // CHECK: call <8 x half> @llvm.amdgcn.cvt.scale.pk8.f16.fp4(i32 %[[I32]], i32 %[[SCALE]], i32 0)
+ %0 = rocdl.cvt.scale.pk8.f16.fp4 %i32, %scale[0] : vector<8xf16>
+ // CHECK: call <8 x bfloat> @llvm.amdgcn.cvt.scale.pk8.bf16.fp4(i32 %[[I32]], i32 %[[SCALE]], i32 0)
+ %1 = rocdl.cvt.scale.pk8.bf16.fp4 %i32, %scale[0] : vector<8xbf16>
+ // CHECK: call <8 x float> @llvm.amdgcn.cvt.scale.pk8.f32.fp4(i32 %[[I32]], i32 %[[SCALE]], i32 0)
+ %2 = rocdl.cvt.scale.pk8.f32.fp4 %i32, %scale[0] : vector<8xf32>
+
+ // CHECK: call <8 x half> @llvm.amdgcn.cvt.scale.pk8.f16.fp8(<2 x i32> %[[V2I32]], i32 %[[SCALE]], i32 0)
+ %3 = rocdl.cvt.scale.pk8.f16.fp8 %v2xi32, %scale[0] : vector<8xf16>
+ // CHECK: call <8 x bfloat> @llvm.amdgcn.cvt.scale.pk8.bf16.fp8(<2 x i32> %[[V2I32]], i32 %[[SCALE]], i32 0)
+ %4 = rocdl.cvt.scale.pk8.bf16.fp8 %v2xi32, %scale[0] : vector<8xbf16>
+ // CHECK: call <8 x float> @llvm.amdgcn.cvt.scale.pk8.f32.fp8(<2 x i32> %[[V2I32]], i32 %[[SCALE]], i32 0)
+ %5 = rocdl.cvt.scale.pk8.f32.fp8 %v2xi32, %scale[0] : vector<8xf32>
+
+ // CHECK: call <8 x half> @llvm.amdgcn.cvt.scale.pk8.f16.bf8(<2 x i32> %[[V2I32]], i32 %[[SCALE]], i32 0)
+ %6 = rocdl.cvt.scale.pk8.f16.bf8 %v2xi32, %scale[0] : vector<8xf16>
+ // CHECK: call <8 x bfloat> @llvm.amdgcn.cvt.scale.pk8.bf16.bf8(<2 x i32> %[[V2I32]], i32 %[[SCALE]], i32 0)
+ %7 = rocdl.cvt.scale.pk8.bf16.bf8 %v2xi32, %scale[0] : vector<8xbf16>
+ // CHECK: call <8 x float> @llvm.amdgcn.cvt.scale.pk8.f32.bf8(<2 x i32> %[[V2I32]], i32 %[[SCALE]], i32 0)
+ %8 = rocdl.cvt.scale.pk8.f32.bf8 %v2xi32, %scale[0] : vector<8xf32>
+
+ llvm.return
+}
+
// CHECK-LABEL: @rocdl.cvt.scale.pk16
// CHECK-SAME:(<3 x i32> %[[SRC0:.+]], i32 %[[SCALE:.+]])
llvm.func @rocdl.cvt.scale.pk16(%v3xi32: vector<3xi32>, %scale:i32) {
More information about the Mlir-commits
mailing list