[Mlir-commits] [mlir] [mlir][amdgpu] Add scaled_ext_packed{8, 16} operations (PR #159830)
Erick Ochoa Lopez
llvmlistbot at llvm.org
Thu Oct 16 11:17:08 PDT 2025
https://github.com/amd-eochoalo updated https://github.com/llvm/llvm-project/pull/159830
>From 9c09c35633e98f47ebe5ad8c15659e01ef6664cc Mon Sep 17 00:00:00 2001
From: Erick Ochoa <erick.ochoalopez at amd.com>
Date: Fri, 19 Sep 2025 14:33:14 -0400
Subject: [PATCH 1/4] [mlir][amdgpu] Add scaled_ext_packed{8,16} operations
---
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 50 ++++++++++++-
mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 70 +++++++++++++++++++
mlir/test/Dialect/AMDGPU/ops.mlir | 55 +++++++++++++++
3 files changed, 174 insertions(+), 1 deletion(-)
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 8370d350afd1e..5cb1486690464 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -112,6 +112,54 @@ def AMDGPU_ExtPackedFp8Op :
}];
}
+def IsValidBlockSize: AttrConstraint<
+ CPred<"::llvm::cast<::mlir::IntegerAttr>($_self).getInt() == 16 || ::llvm::cast<::mlir::IntegerAttr>($_self).getInt() == 32">,
+ "whose value is 16 or 32">;
+
+def AMDGPU_ScaledExtPacked816Op
+ : AMDGPU_Op<"scaled_ext_packed816", [Pure]>,
+ Arguments<(
+ ins AnyTypeOf<[VectorOfLengthAndType<[8], [F4E2M1FN,F8E4M3FN,F8E5M2]>,
+ VectorOfLengthAndType<[16], [F6E2M3FN, F6E3M2FN]>]>:$source,
+ FixedVectorOfLengthAndType<[4], [F8E8M0FNU]>:$scale,
+ ConfinedAttr<I32Attr, [IsValidBlockSize]>:$blockSize,
+ ConfinedAttr<I32Attr, [IntMinValue<0>, IntMaxValue<1>]>:$firstScaleLane,
+ ConfinedAttr<I32Attr, [IntMinValue<0>, IntMaxValue<2>]>:$firstScaleByte)>,
+ Results<(
+ outs AnyTypeOf<[FixedVectorOfLengthAndType<[8], [F32]>,
+ FixedVectorOfLengthAndType<[8], [F16]>,
+ FixedVectorOfLengthAndType<[8], [BF16]>,
+ FixedVectorOfLengthAndType<[16], [F32]>,
+ FixedVectorOfLengthAndType<[16], [F16]>,
+ FixedVectorOfLengthAndType<[16], [BF16]>]>:$res)> {
+
+ let summary = "Extend a vector of packed floating point values";
+
+ let description = [{
+ The scales applied to the input microfloats are stored in two bytes which
+ come from the `scale` input provided in a *half* of the wave identified
+ by `firstScaleLane`. The pair of bytes used is selected by
+ `firstScaleByte`. The 16 vectors in consecutive lanes starting from
+ `firstScaleLane` (which we'll call the scale vectors) will be used by both
+ halves of the wave (with lane L reading from L % 16'th scale vector), but
+ each half will use a different byte.
+
+ When the block size is 32, `firstScaleByte` can be either 0 or 2,
+ selecting halves of the scale vectors. Lanes 0-15 will read from
+ `firstScaleByte` and lanes 16-31 will read from `firstScaleByte` + 1.
+
+ However, when the block size is 16, `firstScaleByte` can be 0 or 1.
+ Lanes 0-15 read from the `firstScaleByte`th element of the scale vectors,
+ while lanes 16-31 read from `firstScaleByte` + 2.
+
+ Note: the layout for the scales generally mirrors the layout that the WMMA
+ instructions use for matrix scales. These selection operands allow
+ one to choose portions of the matrix to convert.
+ }];
+
+ let hasCustomAssemblyFormat = 1;
+}
+
def AMDGPU_ScaledExtPackedOp
: AMDGPU_Op<"scaled_ext_packed", [Pure]>,
Arguments<(
@@ -860,7 +908,7 @@ def AMDGPU_MFMAOp :
based on the provided `m`, `k`, `n`, and `nBlks` attributes, along with the
types of the source and destination arguments.
- For information on the layouts of the input and output matrces (which are stored
+ For information on the layouts of the input and output matrices (which are stored
in `sourceA`, `sourceB`, `destC`, and `destD`), see the CDNA ISA documentation.
The `cbsz`, `abid`, and `blgp` parameters control how the lanes of the wave
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index f405d0cc7aa02..33b0131bd4ca9 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -338,6 +338,76 @@ void RawBufferAtomicCmpswapOp::getCanonicalizationPatterns(
context);
}
+//===----------------------------------------------------------------------===//
+// ScaledExtPacked816Op
+//===----------------------------------------------------------------------===//
+mlir::ParseResult ScaledExtPacked816Op::parse(mlir::OpAsmParser &parser,
+ mlir::OperationState &result) {
+ // Parse attributes
+ if (parser.parseOptionalAttrDict(result.attributes))
+ return failure();
+
+ // Parse source operand
+ OpAsmParser::UnresolvedOperand source;
+ if (parser.parseOperand(source))
+ return failure();
+
+ if (parser.parseKeyword("scale") || parser.parseLParen())
+ return failure();
+ OpAsmParser::UnresolvedOperand scale;
+ if (parser.parseOperand(scale) || parser.parseRParen())
+ return failure();
+
+ // Parse attributes
+ IntegerAttr blockSize, firstScaleLane, firstScaleByte;
+ if (parser.parseKeyword("blockSize") || parser.parseLParen() ||
+ parser.parseAttribute(blockSize, parser.getBuilder().getI32Type()) ||
+ parser.parseRParen())
+ return failure();
+
+ if (parser.parseKeyword("firstScaleLane") || parser.parseLParen() ||
+ parser.parseAttribute(firstScaleLane, parser.getBuilder().getI32Type()) ||
+ parser.parseRParen())
+ return failure();
+
+ if (parser.parseKeyword("firstScaleByte") || parser.parseLParen() ||
+ parser.parseAttribute(firstScaleByte, parser.getBuilder().getI32Type()) ||
+ parser.parseRParen())
+ return failure();
+
+ Type sourceType, resultType;
+ if (parser.parseColon() || parser.parseType(sourceType) ||
+ parser.parseKeyword("to") || parser.parseType(resultType))
+ return failure();
+
+ // Resolve operands with types
+ Type scaleType =
+ VectorType::get({4}, Float8E8M0FNUType::get(parser.getContext()));
+ if (parser.resolveOperand(source, sourceType, result.operands) ||
+ parser.resolveOperand(scale, scaleType, result.operands))
+ return failure();
+
+ result.addAttribute("blockSize", blockSize);
+ result.addAttribute("firstScaleLane", firstScaleLane);
+ result.addAttribute("firstScaleByte", firstScaleByte);
+
+ result.addTypes(resultType);
+ return success();
+}
+
+void ScaledExtPacked816Op::print(OpAsmPrinter &p) {
+ p << " ";
+ p.printOptionalAttrDict(
+ (*this)->getAttrs(),
+ /*elideAttrs=*/{"blockSize", "firstScaleLane", "firstScaleByte"});
+ p << " " << getSource();
+ p << " scale(" << getScale() << ")";
+ p << " blockSize(" << getBlockSize() << ")";
+ p << " firstScaleLane(" << getFirstScaleLane() << ")";
+ p << " firstScaleByte(" << getFirstScaleByte() << ")";
+ p << " : " << getSource().getType() << " to " << getRes().getType();
+}
+
//===----------------------------------------------------------------------===//
// WMMAOp
//===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 8f427e9d56f45..316a79c03aaba 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -221,6 +221,61 @@ func.func @scaled_ext_scalar_f4e2m1_bf16(%v: vector<2xf4E2M1FN>, %scale: f32) ->
func.return %ret : vector<2xbf16>
}
+// CHECK-LABEL: func.func @scaled_ext_packed816_fp4
+func.func @scaled_ext_packed816_fp4(%v: vector<8xf4E2M1FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) {
+ // CHECK: amdgpu.scaled_ext_packed816
+ %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf4E2M1FN> to vector<8xf16>
+ // CHECK: amdgpu.scaled_ext_packed816
+ %ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf4E2M1FN> to vector<8xbf16>
+ // CHECK: amdgpu.scaled_ext_packed816
+ %ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf4E2M1FN> to vector<8xf32>
+ func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_packed816_fp8
+func.func @scaled_ext_packed816_fp8(%v: vector<8xf8E4M3FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) {
+ // CHECK: amdgpu.scaled_ext_packed816
+ %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E4M3FN> to vector<8xf16>
+ // CHECK: amdgpu.scaled_ext_packed816
+ %ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E4M3FN> to vector<8xbf16>
+ // CHECK: amdgpu.scaled_ext_packed816
+ %ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E4M3FN> to vector<8xf32>
+ func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_packed816_bf8
+func.func @scaled_ext_packed816_bf8(%v: vector<8xf8E5M2>, %scale: vector<4xf8E8M0FNU>) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) {
+ // CHECK: amdgpu.scaled_ext_packed816
+ %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2> to vector<8xf16>
+ // CHECK: amdgpu.scaled_ext_packed816
+ %ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2> to vector<8xbf16>
+ // CHECK: amdgpu.scaled_ext_packed816
+ %ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2> to vector<8xf32>
+ func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_packed816_fp6
+func.func @scaled_ext_packed816_fp6(%v: vector<16xf6E2M3FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<16xf16>, vector<16xbf16>, vector<16xf32>) {
+ // CHECK: amdgpu.scaled_ext_packed816
+ %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E2M3FN> to vector<16xf16>
+ // CHECK: amdgpu.scaled_ext_packed816
+ %ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E2M3FN> to vector<16xbf16>
+ // CHECK: amdgpu.scaled_ext_packed816
+ %ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E2M3FN> to vector<16xf32>
+ func.return %ret0, %ret1, %ret2 : vector<16xf16>, vector<16xbf16>, vector<16xf32>
+}
+
+// Round-trips amdgpu.scaled_ext_packed816 with a bf6 (f6E3M2FN) source vector
+// for each result element type (f16, bf16, f32). Named after the source
+// format, like the fp4/fp8/bf8/fp6 tests for the same op.
+// CHECK-LABEL: func.func @scaled_ext_packed816_bf6
+func.func @scaled_ext_packed816_bf6(%v: vector<16xf6E3M2FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<16xf16>, vector<16xbf16>, vector<16xf32>) {
+  // CHECK: amdgpu.scaled_ext_packed816
+  %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E3M2FN> to vector<16xf16>
+  // CHECK: amdgpu.scaled_ext_packed816
+  %ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E3M2FN> to vector<16xbf16>
+  // CHECK: amdgpu.scaled_ext_packed816
+  %ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E3M2FN> to vector<16xf32>
+  func.return %ret0, %ret1, %ret2 : vector<16xf16>, vector<16xbf16>, vector<16xf32>
+}
+
// CHECK-LABEL: func.func @packed_scaled_trunc_f8e4m3_f32
// CHECK: amdgpu.packed_scaled_trunc
func.func @packed_scaled_trunc_f8e4m3_f32(%v: vector<2xf32>, %scale: f32) -> vector<4xf8E4M3FN> {
>From f8b11c4affb0d2667f7360582cea0da890803a22 Mon Sep 17 00:00:00 2001
From: Erick Ochoa <erick.ochoalopez at amd.com>
Date: Thu, 16 Oct 2025 10:32:12 -0400
Subject: [PATCH 2/4] Use TypesMatchWith and make the scale a constant type
---
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 20 +++++-
mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 70 -------------------
2 files changed, 18 insertions(+), 72 deletions(-)
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 5cb1486690464..6f9cf1825c5c2 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -117,7 +117,9 @@ def IsValidBlockSize: AttrConstraint<
"whose value is 16 or 32">;
def AMDGPU_ScaledExtPacked816Op
- : AMDGPU_Op<"scaled_ext_packed816", [Pure]>,
+ : AMDGPU_Op<"scaled_ext_packed816", [Pure, TypesMatchWith<"scale type is fixed",
+ "source", "scale",
+ "ScaledExtPacked816Op::getScaleType($_self.getContext())">]>,
Arguments<(
ins AnyTypeOf<[VectorOfLengthAndType<[8], [F4E2M1FN,F8E4M3FN,F8E5M2]>,
VectorOfLengthAndType<[16], [F6E2M3FN, F6E3M2FN]>]>:$source,
@@ -157,7 +159,21 @@ def AMDGPU_ScaledExtPacked816Op
one to choose portions of the matrix to convert.
}];
- let hasCustomAssemblyFormat = 1;
+ let assemblyFormat = [{
+ attr-dict $source
+ `scale` `(` $scale `)`
+ `blockSize` `(` $blockSize `)`
+ `firstScaleLane` `(` $firstScaleLane`)`
+ `firstScaleByte` `(` $firstScaleByte `)`
+ `:` type($source) `to` type($res)
+ }];
+
+ let extraClassDeclaration = [{
+ static Type getScaleType(MLIRContext *ctx) {
+ return VectorType::get(4, Float8E8M0FNUType::get(ctx));
+ }
+ }];
+
}
def AMDGPU_ScaledExtPackedOp
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index 33b0131bd4ca9..f405d0cc7aa02 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -338,76 +338,6 @@ void RawBufferAtomicCmpswapOp::getCanonicalizationPatterns(
context);
}
-//===----------------------------------------------------------------------===//
-// ScaledExtPacked816Op
-//===----------------------------------------------------------------------===//
-mlir::ParseResult ScaledExtPacked816Op::parse(mlir::OpAsmParser &parser,
- mlir::OperationState &result) {
- // Parse attributes
- if (parser.parseOptionalAttrDict(result.attributes))
- return failure();
-
- // Parse source operand
- OpAsmParser::UnresolvedOperand source;
- if (parser.parseOperand(source))
- return failure();
-
- if (parser.parseKeyword("scale") || parser.parseLParen())
- return failure();
- OpAsmParser::UnresolvedOperand scale;
- if (parser.parseOperand(scale) || parser.parseRParen())
- return failure();
-
- // Parse attributes
- IntegerAttr blockSize, firstScaleLane, firstScaleByte;
- if (parser.parseKeyword("blockSize") || parser.parseLParen() ||
- parser.parseAttribute(blockSize, parser.getBuilder().getI32Type()) ||
- parser.parseRParen())
- return failure();
-
- if (parser.parseKeyword("firstScaleLane") || parser.parseLParen() ||
- parser.parseAttribute(firstScaleLane, parser.getBuilder().getI32Type()) ||
- parser.parseRParen())
- return failure();
-
- if (parser.parseKeyword("firstScaleByte") || parser.parseLParen() ||
- parser.parseAttribute(firstScaleByte, parser.getBuilder().getI32Type()) ||
- parser.parseRParen())
- return failure();
-
- Type sourceType, resultType;
- if (parser.parseColon() || parser.parseType(sourceType) ||
- parser.parseKeyword("to") || parser.parseType(resultType))
- return failure();
-
- // Resolve operands with types
- Type scaleType =
- VectorType::get({4}, Float8E8M0FNUType::get(parser.getContext()));
- if (parser.resolveOperand(source, sourceType, result.operands) ||
- parser.resolveOperand(scale, scaleType, result.operands))
- return failure();
-
- result.addAttribute("blockSize", blockSize);
- result.addAttribute("firstScaleLane", firstScaleLane);
- result.addAttribute("firstScaleByte", firstScaleByte);
-
- result.addTypes(resultType);
- return success();
-}
-
-void ScaledExtPacked816Op::print(OpAsmPrinter &p) {
- p << " ";
- p.printOptionalAttrDict(
- (*this)->getAttrs(),
- /*elideAttrs=*/{"blockSize", "firstScaleLane", "firstScaleByte"});
- p << " " << getSource();
- p << " scale(" << getScale() << ")";
- p << " blockSize(" << getBlockSize() << ")";
- p << " firstScaleLane(" << getFirstScaleLane() << ")";
- p << " firstScaleByte(" << getFirstScaleByte() << ")";
- p << " : " << getSource().getType() << " to " << getRes().getType();
-}
-
//===----------------------------------------------------------------------===//
// WMMAOp
//===----------------------------------------------------------------------===//
>From e71f8d8c85dbe354b0e7142d44de02becfb7c813 Mon Sep 17 00:00:00 2001
From: Erick Ochoa <erick.ochoalopez at amd.com>
Date: Thu, 16 Oct 2025 10:51:36 -0400
Subject: [PATCH 3/4] Add note about availability on gfx1250+
---
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 2 ++
1 file changed, 2 insertions(+)
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 6f9cf1825c5c2..05525d3a061de 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -157,6 +157,8 @@ def AMDGPU_ScaledExtPacked816Op
Note: the layout for the scales generally mirrors the layout that the WMMA
instructions use for matrix scales. These selection operands allow
one to choose portions of the matrix to convert.
+
+ Available on gfx1250+.
}];
let assemblyFormat = [{
>From 4f83cd9a8df19ac3ae4ce230c5b401d5f09b2911 Mon Sep 17 00:00:00 2001
From: Erick Ochoa <erick.ochoalopez at amd.com>
Date: Thu, 16 Oct 2025 14:16:50 -0400
Subject: [PATCH 4/4] Add verifier for blockSize and firstScaleByte
---
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 2 ++
mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 13 +++++++++++++
mlir/test/Dialect/AMDGPU/invalid.mlir | 8 ++++++++
3 files changed, 23 insertions(+)
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 05525d3a061de..54464997931d7 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -176,6 +176,8 @@ def AMDGPU_ScaledExtPacked816Op
}
}];
+ let hasVerifier = 1;
+
}
def AMDGPU_ScaledExtPackedOp
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index f405d0cc7aa02..06dbf7520c4fd 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -338,6 +338,19 @@ void RawBufferAtomicCmpswapOp::getCanonicalizationPatterns(
context);
}
+//===----------------------------------------------------------------------===//
+// ScaledExtPacked816Op
+//===----------------------------------------------------------------------===//
+/// Verifies the cross-attribute relationship between `blockSize` and
+/// `firstScaleByte`, which the per-attribute ODS constraints cannot express.
+///
+/// Per the op description, a block size of 32 admits a `firstScaleByte` of 0
+/// or 2, while a block size of 16 admits 0 or 1. The ODS constraints already
+/// confine `blockSize` to {16, 32} and `firstScaleByte` to [0, 2], so exactly
+/// one combination has to be rejected for each block size.
+///
+/// Returns success() for a valid combination, otherwise emits an op error.
+LogicalResult ScaledExtPacked816Op::verify() {
+  int blockSize = getBlockSize();
+  // Expected to hold already because of the `IsValidBlockSize` constraint.
+  assert((blockSize == 16 || blockSize == 32) && "invalid block size");
+  int firstScaleByte = getFirstScaleByte();
+  if (blockSize == 16 && firstScaleByte == 2) {
+    return emitOpError("blockSize of 16 cannot have firstScaleByte be 2.");
+  }
+  // Block size 32 selects halves of the scale vectors, so only bytes 0 and 2
+  // are valid starting points.
+  if (blockSize == 32 && firstScaleByte == 1) {
+    return emitOpError("blockSize of 32 cannot have firstScaleByte be 1.");
+  }
+  return success();
+}
+
//===----------------------------------------------------------------------===//
// WMMAOp
//===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir
index 66e7dd4014af9..41a5c8dd26676 100644
--- a/mlir/test/Dialect/AMDGPU/invalid.mlir
+++ b/mlir/test/Dialect/AMDGPU/invalid.mlir
@@ -238,3 +238,11 @@ func.func @gather_to_lds_non_lds(%idx1 : index, %mem1 : memref<32xf16>, %mem2 :
amdgpu.gather_to_lds %mem1[%idx1], %mem2[%idx1] : vector<2xf16>, memref<32xf16>, memref<32xf16, strided<[?]>, #gpu.address_space<workgroup>>
func.return
}
+
+// -----
+
+// The verifier must reject blockSize(16) combined with firstScaleByte(2).
+func.func @amdgpu.scaled_ext_packed816_invalid_block_size_and_first_scale_byte(%v: vector<8xf8E5M2>, %scale: vector<4xf8E8M0FNU>) {
+  // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op blockSize of 16 cannot have firstScaleByte be 2.}}
+  %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(16) firstScaleLane(0) firstScaleByte(2) : vector<8xf8E5M2> to vector<8xf16>
+  func.return
+}
More information about the Mlir-commits
mailing list