[Mlir-commits] [mlir] [mlir][amdgpu] Fix documentation and verifiers (PR #167369)
Erick Ochoa Lopez
llvmlistbot at llvm.org
Mon Nov 10 10:51:15 PST 2025
https://github.com/amd-eochoalo created https://github.com/llvm/llvm-project/pull/167369
None
>From 7b64631ffa0f4e2880d0a839443e450120da7a68 Mon Sep 17 00:00:00 2001
From: Erick Ochoa <erick.ochoalopez at amd.com>
Date: Mon, 10 Nov 2025 13:23:20 -0500
Subject: [PATCH 1/2] Update documentation
---
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 35 +++++++++++++------
1 file changed, 25 insertions(+), 10 deletions(-)
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 45cb67f0eee4a..4820b7a747ac2 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -127,7 +127,7 @@ def AMDGPU_ScaledExtPacked816Op
FixedVectorOfShapeAndType<[4], F8E8M0FNU>:$scale,
ConfinedAttr<I32Attr, [IsValidBlockSize]>:$blockSize,
ConfinedAttr<I32Attr, [IntMinValue<0>, IntMaxValue<1>]>:$firstScaleLane,
- ConfinedAttr<I32Attr, [IntMinValue<0>, IntMaxValue<2>]>:$firstScaleByte)>,
+ ConfinedAttr<I32Attr, [IntMinValue<0>, IntMaxValue<3>]>:$firstScaleByte)>,
Results<(
outs AnyTypeOf<[FixedVectorOfShapeAndType<[8], F32>,
FixedVectorOfShapeAndType<[8], F16>,
@@ -139,17 +139,21 @@ def AMDGPU_ScaledExtPacked816Op
let summary = "Extend a vector of packed floating point values";
let description = [{
- The scales applied to the input microfloats are stored in two bytes which
+ The scales applied to the input microfloats are stored in bytes which
come from the `scales` input provided in a *half* of the wave identified
- by `firstScaleLane`. The pair of bytes used is selected by
- `firstScaleByte`. The 16 vectors in consecutive lanes starting from
+ by `firstScaleLane`. The bytes used are selected by `firstScaleByte` and depend
+ on the type of `source`. The 16 vectors in consecutive lanes starting from
`firstScaleLane` (which we'll call the scale vectors) will be used by both
- halves of the wave (with lane L reading from L % 16'th scale vector), but
- each half will use a different byte.
+ halves of the wave (with lane L reading from L % 16'th scale vector).
+
+ When `source` is either F4E2M1FN, F6E2M3FN, or F6E3M2FN, each half of the
+ wave will use a different byte, the first one being `firstScaleByte` and
+ the second one being `firstScaleByte` + 1. When the block size is 32,
+ `firstScaleByte` can be either 0 or 2, selecting halves of the scale vectors.
+ Lanes 0-15 will read from `firstScaleByte` and lanes 16-31 will read
+ from `firstScaleByte` + 1.
+
- When the block size is 32, `firstScaleByte` can be either 0 or 2,
- selecting halves of the scale vectors. Lanes 0-15 will read from
- `firstScaleByte` and lanes 16-31 will read from `firstScaleByte` + 1.
For example:
```mlir
// Input: 8-element vector of F8E4M3FN, converting to F32
@@ -165,7 +169,8 @@ def AMDGPU_ScaledExtPacked816Op
: vector<16xf6E2M3FN>, vector<4xf8E8M0FNU> -> vector<16xf16>
```
- However, when the block size is 16, `firstScaleByte` can be 0 or 1.
+ When `source` is either F4E2M1FN, F6E2M3FN, or F6E3M2FN and
+ the block size is 16, `firstScaleByte` can be 0 or 1.
Lanes 0-15 read from the `firstScaleByte`th element of the scale vectors,
while lanes 16-31 read from `firstScaleByte` + 2.
For example:
@@ -187,6 +192,16 @@ def AMDGPU_ScaledExtPacked816Op
instructions use for matrix scales. These selection operands allow
one to choose portions of the matrix to convert.
+ When `source` is either F8E4M3FN or F8E5M2 and `blockSize` is 32,
+ then the same byte will be used by both halves of the wave.
+ In this case, `firstScaleByte` can be any value from 0 to 3.
+
+ When `source` is either F8E4M3FN or F8E5M2 and `blockSize` is 16,
+ the following combinations are allowed:
+ * `firstScaleLane(0), firstScaleByte(0)`
+ * `firstScaleLane(1), firstScaleByte(2)`
+ all other combinations are reserved.
+
Available on gfx1250+.
}];
>From 08e96b19369451dd5ec4e72ed2905bd0b2e0cf71 Mon Sep 17 00:00:00 2001
From: Erick Ochoa <erick.ochoalopez at amd.com>
Date: Mon, 10 Nov 2025 13:44:08 -0500
Subject: [PATCH 2/2] Fix verifiers
---
mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 23 +++++++++++++++-----
mlir/test/Dialect/AMDGPU/invalid.mlir | 20 ++++++++++++-----
2 files changed, 32 insertions(+), 11 deletions(-)
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index df955fc90b45f..5c35823678576 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -344,14 +344,27 @@ void RawBufferAtomicCmpswapOp::getCanonicalizationPatterns(
LogicalResult ScaledExtPacked816Op::verify() {
int blockSize = getBlockSize();
assert((blockSize == 16 || blockSize == 32) && "invalid block size");
+
int firstScaleByte = getFirstScaleByte();
- if (blockSize == 16 && !llvm::is_contained({0, 1}, firstScaleByte)) {
- return emitOpError(
- "blockSize of 16 can only have firstScaleByte be 0 or 1.");
+ auto sourceType = cast<VectorType>(getSource().getType());
+ Type elementType = sourceType.getElementType();
+ auto floatType = cast<FloatType>(elementType);
+ int bitWidth = floatType.getWidth();
+
+ if (llvm::is_contained({4, 6}, bitWidth) && blockSize == 16 &&
+ !llvm::is_contained({0, 1}, firstScaleByte)) {
+ return emitOpError("blockSize of 16 can only have firstScaleByte be 0 or 1 "
+ "for f4 and f6.");
+ }
+ if (llvm::is_contained({4, 6}, bitWidth) && blockSize == 32 &&
+ !llvm::is_contained({0, 2}, firstScaleByte)) {
+ return emitOpError("blockSize of 32 can only have firstScaleByte be 0 or 2 "
+ "for f4 and f6.");
}
- if (blockSize == 32 && !llvm::is_contained({0, 2}, firstScaleByte)) {
+ if (bitWidth == 8 && blockSize == 16 &&
+ !llvm::is_contained({0, 2}, firstScaleByte)) {
return emitOpError(
- "blockSize of 32 can only have firstScaleByte be 0 or 2.");
+ "blockSize of 16 can only have firstScaleByte be 0 or 2 for f8.");
}
return success();
diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir
index 4c6f62a045405..5c8cc8b67c4b3 100644
--- a/mlir/test/Dialect/AMDGPU/invalid.mlir
+++ b/mlir/test/Dialect/AMDGPU/invalid.mlir
@@ -333,17 +333,25 @@ func.func @gather_to_lds_non_lds(%idx1 : index, %mem1 : memref<32xf16>, %mem2 :
// -----
-func.func @amdgpu.scaled_ext_packed816_invalid_block_size_and_first_scale_byte_16(%v: vector<8xf8E5M2>, %scale: vector<4xf8E8M0FNU>) {
- // expected-error @+1 {{'amdgpu.scaled_ext_packed816' op blockSize of 16 can only have firstScaleByte be 0 or 1.}}
- %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(16) firstScaleLane(0) firstScaleByte(2) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xf16>
+func.func @amdgpu.scaled_ext_packed816_invalid_block_size_and_first_scale_byte_16(%v: vector<8xf4E2M1FN>, %scale: vector<4xf8E8M0FNU>) {
+ // expected-error @+1 {{'amdgpu.scaled_ext_packed816' op blockSize of 16 can only have firstScaleByte be 0 or 1 for f4 and f6}}
+ %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(16) firstScaleLane(0) firstScaleByte(2) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xf16>
func.return
}
// -----
-func.func @amdgpu.scaled_ext_packed816_invalid_block_size_and_first_scale_byte_32(%v: vector<8xf8E5M2>, %scale: vector<4xf8E8M0FNU>) {
- // expected-error @+1 {{'amdgpu.scaled_ext_packed816' op blockSize of 32 can only have firstScaleByte be 0 or 2.}}
- %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(1) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xf16>
+func.func @amdgpu.scaled_ext_packed816_invalid_block_size_and_first_scale_byte_32(%v: vector<8xf4E2M1FN>, %scale: vector<4xf8E8M0FNU>) {
+ // expected-error @+1 {{'amdgpu.scaled_ext_packed816' op blockSize of 32 can only have firstScaleByte be 0 or 2 for f4 and f6.}}
+ %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(1) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xf16>
+ func.return
+}
+
+// -----
+
+func.func @amdgpu.scaled_ext_packed816_invalid_attributes_for_f8(%v: vector<8xf8E5M2>, %scale: vector<4xf8E8M0FNU>) {
+ // expected-error @+1 {{'amdgpu.scaled_ext_packed816' op blockSize of 16 can only have firstScaleByte be 0 or 2 for f8.}}
+ %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(16) firstScaleLane(0) firstScaleByte(1) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xf16>
func.return
}
More information about the Mlir-commits
mailing list