[Mlir-commits] [mlir] [mlir][AMDGPU] Add scaled floating point conversion ops fp8 (PR #141554)
Tim Gymnich
llvmlistbot at llvm.org
Tue Jun 10 04:18:50 PDT 2025
https://github.com/tgymnich updated https://github.com/llvm/llvm-project/pull/141554
>From ed7a29671fa16fde6848ccb49b125e0fb1b3c870 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Tue, 27 May 2025 06:52:40 +0000
Subject: [PATCH 1/6] [mlir][AMDGPU] implement ScaledExtPackedOp and
PackedScaledTruncOp
---
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 62 ++++++
.../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 181 +++++++++++++++++-
2 files changed, 242 insertions(+), 1 deletion(-)
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 02308568c1ad1..dfdb19b2f7e82 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -112,6 +112,38 @@ def AMDGPU_ExtPackedFp8Op :
}];
}
+def AMDGPU_ScaledExtPackedOp
+ : AMDGPU_Op<"scaled_ext_packed", [Pure]>,
+ Arguments<(
+ ins AnyTypeOf<[VectorOfLengthAndType<[2, 3, 4], [F8E5M2, F8E4M3FN]>,
+ VectorOfLengthAndType<[2, 3, 4, 5, 6, 7, 8],
+ [F4E2M1FN]>]>:$source,
+ F32:$scale,
+ ConfinedAttr<I32Attr, [IntNonNegative, IntMaxValue<7>]>:$index)>,
+ Results<(
+ outs AnyTypeOf<[FixedVectorOfLengthAndType<[2], [F32]>,
+ FixedVectorOfLengthAndType<[2], [F16]>,
+ FixedVectorOfLengthAndType<[2], [BF16]>]>:$res)> {
+ let summary = "Extend a vector of packed floating point values";
+
+ let description = [{
+ Extend and scale two packed floats in `source[index]` to two floats and
+ return them.
+
+    This rather unusual signature arises from the fact that AMD GPUs cannot
+    easily work with sub 32-bit quantities, so the compiler intrinsics for
+    extending 8-bit and 4-bit floats (which are, currently, the only way to
+    work with this operation) take 32-bit packed vectors of 4 (for 8-bit
+    floats) or 8 (for 4-bit floats) such values.
+
+    If the passed-in vector has fewer elements than the packed form
+    (`<4 x i8>` for 8-bit floats, `<8 x i4>` for 4-bit floats), the
+    remaining lanes will be filled with undefined values as needed.
+ }];
+ let assemblyFormat = [{
+ attr-dict $source `[` $index `]` `,` $scale `:` type($source) `to` type($res)
+ }];
+}
+
def AMDGPU_PackedTrunc2xFp8Op :
AMDGPU_Op<"packed_trunc_2xfp8", [Pure, AttrSizedOperandSegments]>,
Arguments<(ins F32:$sourceA,
@@ -139,6 +171,36 @@ def AMDGPU_PackedTrunc2xFp8Op :
let hasVerifier = 1;
}
+def AMDGPU_PackedScaledTruncOp
+ : AMDGPU_Op<"packed_scaled_trunc", [Pure]>,
+ Arguments<(ins VectorOfLengthAndType<[2], [F32, F16, BF16]>:$source,
+ F32:$scale,
+ ConfinedAttr<I32Attr, [IntNonNegative, IntMaxValue<7>]>:$index,
+ Optional<AnyTypeOf<
+ [FixedVectorOfLengthAndType<[4], [F8E5M2, F8E4M3FN]>,
+ FixedVectorOfLengthAndType<[8], [F4E2M1FN]>]>>:$existing)>,
+ Results<(
+ outs AnyTypeOf<[FixedVectorOfLengthAndType<[4], [F8E5M2, F8E4M3FN]>,
+ FixedVectorOfLengthAndType<[8], [F4E2M1FN]>]>:$res)> {
+ let summary = "Round two floats into a packed vector of floats";
+ let description = [{
+    Scale and round the two elements of `source` into the word of the
+    returned vector selected by `index`, keeping the other elements of
+    `existing` unchanged if present (or undefined if it was not passed in).
+
+ The reason for this odd signature is that AMD GPUs cannot easily work with
+ sub-registers, and so the conversion intrinsics take 32-bit wide
+ packed vectors of float values.
+ }];
+ let assemblyFormat = [{
+ attr-dict $source `into` ($existing^):(`undef`)? `[` `index` $index `]`
+ `,` $scale
+ `:` type($source) `to` type($res) (`into` type($existing)^)?
+ }];
+ let hasVerifier = 0;
+}
+
def AMDGPU_PackedStochRoundFp8Op :
AMDGPU_Op<"packed_stoch_round_fp8", [Pure]>,
Arguments<(ins F32:$source,
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index c5094799bbef7..aed9b71c69cba 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -24,6 +24,7 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
#include <optional>
namespace mlir {
@@ -1174,6 +1175,32 @@ struct PackedStochRoundFp8OpLowering final
PackedStochRoundFp8OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override;
};
+
+struct ScaledExtPackedOpLowering final
+ : public ConvertOpToLLVMPattern<ScaledExtPackedOp> {
+ ScaledExtPackedOpLowering(const LLVMTypeConverter &converter, Chipset chipset)
+ : ConvertOpToLLVMPattern<amdgpu::ScaledExtPackedOp>(converter),
+ chipset(chipset) {}
+ Chipset chipset;
+
+ LogicalResult
+ matchAndRewrite(ScaledExtPackedOp op, ScaledExtPackedOpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override;
+};
+
+struct PackedScaledTruncOpLowering final
+ : public ConvertOpToLLVMPattern<PackedScaledTruncOp> {
+ PackedScaledTruncOpLowering(const LLVMTypeConverter &converter,
+ Chipset chipset)
+ : ConvertOpToLLVMPattern<amdgpu::PackedScaledTruncOp>(converter),
+ chipset(chipset) {}
+ Chipset chipset;
+
+ LogicalResult
+ matchAndRewrite(PackedScaledTruncOp op, PackedScaledTruncOpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override;
+};
+
} // end namespace
LogicalResult ExtPackedFp8OpLowering::matchAndRewrite(
@@ -1230,6 +1257,157 @@ LogicalResult ExtPackedFp8OpLowering::matchAndRewrite(
return success();
}
+LogicalResult ScaledExtPackedOpLowering::matchAndRewrite(
+ ScaledExtPackedOp op, ScaledExtPackedOpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const {
+ Location loc = op.getLoc();
+ if (chipset != kGfx950)
+ return rewriter.notifyMatchFailure(
+ loc, "Scaled fp8 conversion instructions are not available on target "
+ "architecture and their emulation is not implemented");
+ Type i32 = getTypeConverter()->convertType(rewriter.getI32Type());
+
+ Value source = adaptor.getSource();
+ Value scale = adaptor.getScale();
+
+ VectorType sourceVecType = dyn_cast<VectorType>(op.getSource().getType());
+ Type sourceElemType = getElementTypeOrSelf(op.getSource());
+ VectorType destVecType = dyn_cast<VectorType>(op.getResult().getType());
+ Type destElemType = getElementTypeOrSelf(op.getResult());
+
+ VectorType packedVecType;
+ if (isa<Float8E5M2Type, Float8E4M3FNType>(sourceElemType)) {
+ VectorType v4i8 = VectorType::get(4, rewriter.getI8Type());
+ packedVecType = cast<VectorType>(getTypeConverter()->convertType(v4i8));
+ } else if (isa<Float4E2M1FNType>(sourceElemType)) {
+ VectorType v8i4 = VectorType::get(8, rewriter.getI4Type());
+ packedVecType = cast<VectorType>(getTypeConverter()->convertType(v8i4));
+ } else {
+ llvm_unreachable("invalid element type for scaled ext");
+ }
+
+ // Extend to a packedVectorType
+ if (!sourceVecType ||
+ sourceVecType.getNumElements() < packedVecType.getNumElements()) {
+ Value longVec = rewriter.create<LLVM::UndefOp>(loc, packedVecType);
+ if (!sourceVecType) {
+ longVec = rewriter.create<LLVM::InsertElementOp>(
+ loc, longVec, source, createI32Constant(rewriter, loc, 0));
+ } else {
+ for (int32_t i = 0, e = sourceVecType.getNumElements(); i < e; ++i) {
+ Value idx = createI32Constant(rewriter, loc, i);
+ Value elem = rewriter.create<LLVM::ExtractElementOp>(loc, source, idx);
+ longVec =
+ rewriter.create<LLVM::InsertElementOp>(loc, longVec, elem, idx);
+ }
+ }
+ source = longVec;
+ }
+ Value i32Source = rewriter.create<LLVM::BitcastOp>(loc, i32, source);
+
+ if (isa<Float8E5M2Type>(sourceElemType) && destElemType.isF32())
+ rewriter.replaceOpWithNewOp<ROCDL::CvtScaleF32PkF32Bf8Op>(
+ op, destVecType, i32Source, scale, op.getIndex());
+ else if (isa<Float8E5M2Type>(sourceElemType) && destElemType.isF16())
+ rewriter.replaceOpWithNewOp<ROCDL::CvtScaleF32PkF16Bf8Op>(
+ op, destVecType, i32Source, scale, op.getIndex());
+ else if (isa<Float8E5M2Type>(sourceElemType) && destElemType.isBF16())
+ rewriter.replaceOpWithNewOp<ROCDL::CvtScaleF32PkBf16Bf8Op>(
+ op, destVecType, i32Source, scale, op.getIndex());
+ else if (isa<Float8E4M3FNType>(sourceElemType) && destElemType.isF32())
+ rewriter.replaceOpWithNewOp<ROCDL::CvtScaleF32PkF32Fp8Op>(
+ op, destVecType, i32Source, scale, op.getIndex());
+ else if (isa<Float8E4M3FNType>(sourceElemType) && destElemType.isF16())
+ rewriter.replaceOpWithNewOp<ROCDL::CvtScaleF32PkF16Fp8Op>(
+ op, destVecType, i32Source, scale, op.getIndex());
+ else if (isa<Float8E4M3FNType>(sourceElemType) && destElemType.isBF16())
+ rewriter.replaceOpWithNewOp<ROCDL::CvtScaleF32PkBf16Fp8Op>(
+ op, destVecType, i32Source, scale, op.getIndex());
+ else if (isa<Float4E2M1FNType>(sourceElemType) && destElemType.isF32())
+ rewriter.replaceOpWithNewOp<ROCDL::CvtScaleF32PkF32Fp4Op>(
+ op, destVecType, i32Source, scale, op.getIndex());
+ else if (isa<Float4E2M1FNType>(sourceElemType) && destElemType.isF16())
+ rewriter.replaceOpWithNewOp<ROCDL::CvtScaleF32PkF16Fp4Op>(
+ op, destVecType, i32Source, scale, op.getIndex());
+ else if (isa<Float4E2M1FNType>(sourceElemType) && destElemType.isBF16())
+ rewriter.replaceOpWithNewOp<ROCDL::CvtScaleF32PkBf16Fp4Op>(
+ op, destVecType, i32Source, scale, op.getIndex());
+ else
+ return failure();
+
+ return success();
+}
+
+LogicalResult PackedScaledTruncOpLowering::matchAndRewrite(
+ PackedScaledTruncOp op, PackedScaledTruncOpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const {
+ Location loc = op.getLoc();
+ if (chipset != kGfx950)
+ return rewriter.notifyMatchFailure(
+ loc, "Scaled fp8 conversion instructions are not available on target "
+ "architecture and their emulation is not implemented");
+ Type v2i16 = getTypeConverter()->convertType(
+ VectorType::get(2, rewriter.getI16Type()));
+ Type i32 = getTypeConverter()->convertType(rewriter.getI32Type());
+
+ Type resultType = op.getResult().getType();
+ Type resultElemType = getElementTypeOrSelf(resultType);
+ Type sourceElemType = getElementTypeOrSelf(op.getSource());
+
+ Type intResultType = isa<Float4E2M1FNType>(resultElemType) ? i32 : v2i16;
+
+ Value source = adaptor.getSource();
+ Value scale = adaptor.getScale();
+ Value existing = adaptor.getExisting();
+ if (existing)
+ existing = rewriter.create<LLVM::BitcastOp>(loc, intResultType, existing);
+ else
+ existing = rewriter.create<LLVM::UndefOp>(loc, intResultType);
+
+ Value sourceA, sourceB;
+ if (sourceElemType.isF32()) {
+ Value c0 = createI32Constant(rewriter, loc, 0);
+ Value c1 = createI32Constant(rewriter, loc, 1);
+ sourceA = rewriter.create<LLVM::ExtractElementOp>(loc, source, c0);
+ sourceB = rewriter.create<LLVM::ExtractElementOp>(loc, source, c1);
+ }
+
+ Value result;
+ if (sourceElemType.isF32() && isa<Float8E5M2Type>(resultElemType))
+ result = rewriter.create<ROCDL::CvtScaleF32PkBf8F32Op>(
+ loc, intResultType, existing, sourceA, sourceB, scale, op.getIndex());
+ else if (sourceElemType.isF16() && isa<Float8E5M2Type>(resultElemType))
+ result = rewriter.create<ROCDL::CvtScaleF32PkBf8F16Op>(
+ loc, intResultType, existing, source, scale, op.getIndex());
+ else if (sourceElemType.isBF16() && isa<Float8E5M2Type>(resultElemType))
+ result = rewriter.create<ROCDL::CvtScaleF32PkBf8Bf16Op>(
+ loc, intResultType, existing, source, scale, op.getIndex());
+ else if (sourceElemType.isF32() && isa<Float8E4M3FNType>(resultElemType))
+ result = rewriter.create<ROCDL::CvtScaleF32PkFp8F32Op>(
+ loc, intResultType, existing, sourceA, sourceB, scale, op.getIndex());
+ else if (sourceElemType.isF16() && isa<Float8E4M3FNType>(resultElemType))
+ result = rewriter.create<ROCDL::CvtScaleF32PkFp8F16Op>(
+ loc, intResultType, existing, source, scale, op.getIndex());
+ else if (sourceElemType.isBF16() && isa<Float8E4M3FNType>(resultElemType))
+ result = rewriter.create<ROCDL::CvtScaleF32PkFp8Bf16Op>(
+ loc, intResultType, existing, source, scale, op.getIndex());
+ else if (sourceElemType.isF32() && isa<Float4E2M1FNType>(resultElemType))
+ result = rewriter.create<ROCDL::CvtScaleF32PkFp4F32Op>(
+ loc, intResultType, existing, sourceA, sourceB, scale, op.getIndex());
+ else if (sourceElemType.isF16() && isa<Float4E2M1FNType>(resultElemType))
+ result = rewriter.create<ROCDL::CvtScaleF32PkFp4F16Op>(
+ loc, intResultType, existing, source, scale, op.getIndex());
+ else if (sourceElemType.isBF16() && isa<Float4E2M1FNType>(resultElemType))
+ result = rewriter.create<ROCDL::CvtScaleF32PkFp4Bf16Op>(
+ loc, intResultType, existing, source, scale, op.getIndex());
+ else
+ return failure();
+
+ result = rewriter.replaceOpWithNewOp<LLVM::BitcastOp>(
+ op, getTypeConverter()->convertType(resultType), result);
+ return success();
+}
+
LogicalResult PackedTrunc2xFp8OpLowering::matchAndRewrite(
PackedTrunc2xFp8Op op, PackedTrunc2xFp8OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const {
@@ -1547,7 +1725,8 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
ROCDL::RawPtrBufferAtomicCmpSwap>,
AMDGPUDPPLowering, LDSBarrierOpLowering, SchedBarrierOpLowering,
MFMAOpLowering, ScaledMFMAOpLowering, WMMAOpLowering,
- ExtPackedFp8OpLowering, PackedTrunc2xFp8OpLowering,
+ ExtPackedFp8OpLowering, ScaledExtPackedOpLowering,
+ PackedScaledTruncOpLowering, PackedTrunc2xFp8OpLowering,
PackedStochRoundFp8OpLowering, GatherToLDSOpLowering>(converter,
chipset);
patterns.add<AMDGPUSwizzleBitModeLowering>(converter);
>From a91ac290f201c2eb52ab5ce006bb9c24f8fb9208 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Wed, 4 Jun 2025 18:14:25 +0200
Subject: [PATCH 2/6] update tests
---
.../Conversion/AMDGPUToROCDL/packed-ext.mlir | 375 ++++++++++++++++++
.../AMDGPUToROCDL/packed-trunc.mlir | 232 +++++++++++
mlir/test/Dialect/AMDGPU/ops.mlir | 315 +++++++++++++++
3 files changed, 922 insertions(+)
create mode 100644 mlir/test/Conversion/AMDGPUToROCDL/packed-ext.mlir
create mode 100644 mlir/test/Conversion/AMDGPUToROCDL/packed-trunc.mlir
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/packed-ext.mlir b/mlir/test/Conversion/AMDGPUToROCDL/packed-ext.mlir
new file mode 100644
index 0000000000000..27a66e9f1bcbd
--- /dev/null
+++ b/mlir/test/Conversion/AMDGPUToROCDL/packed-ext.mlir
@@ -0,0 +1,375 @@
+// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx950 | FileCheck %s
+
+// CHECK-LABEL: func.func @scaled_ext_full_f8e4m3_f32
+// CHECK-DAG: [[CAST:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<4xf8E4M3FN> to vector<4xi8>
+// CHECK-DAG: [[BITCAST:%.+]] = llvm.bitcast [[CAST]] : vector<4xi8> to i32
+// CHECK: rocdl.cvt.scalef32.pk.f32.fp8 [[BITCAST]][false], %arg1 : vector<2xf32>
+func.func @scaled_ext_full_f8e4m3_f32(%v: vector<4xf8E4M3FN>, %scale: f32) -> vector<2xf32> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf8E4M3FN> to vector<2xf32>
+ func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f8e4m3_f16
+// CHECK-DAG: [[CAST:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<4xf8E4M3FN> to vector<4xi8>
+// CHECK-DAG: [[BITCAST:%.+]] = llvm.bitcast [[CAST]] : vector<4xi8> to i32
+// CHECK: rocdl.cvt.scalef32.pk.f16.fp8 [[BITCAST]][false], %arg1 : vector<2xf16>
+func.func @scaled_ext_full_f8e4m3_f16(%v: vector<4xf8E4M3FN>, %scale: f32) -> vector<2xf16> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf8E4M3FN> to vector<2xf16>
+ func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f8e4m3_bf16
+// CHECK-DAG: [[CAST:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<4xf8E4M3FN> to vector<4xi8>
+// CHECK-DAG: [[BITCAST:%.+]] = llvm.bitcast [[CAST]] : vector<4xi8> to i32
+// CHECK: rocdl.cvt.scalef32.pk.bf16.fp8 [[BITCAST]][false], %arg1 : vector<2xbf16>
+func.func @scaled_ext_full_f8e4m3_bf16(%v: vector<4xf8E4M3FN>, %scale: f32) -> vector<2xbf16> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf8E4M3FN> to vector<2xbf16>
+ func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f8e4m3_f32
+// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E4M3FN> to vector<2xi8>
+// CHECK-DAG: [[UNDEF_:%.+]] = llvm.mlir.undef : vector<4xi8>
+// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK: [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
+// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[UNDEF_]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK-DAG: [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK: [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
+// CHECK: [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
+// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[VEC_1]] : vector<4xi8> to i32
+// CHECK: rocdl.cvt.scalef32.pk.f32.fp8 [[BITCAST]][false], %arg1 : vector<2xf32>
+func.func @scaled_ext_half_f8e4m3_f32(%v: vector<2xf8E4M3FN>, %scale: f32) -> vector<2xf32> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E4M3FN> to vector<2xf32>
+ func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f8e4m3_f16
+// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E4M3FN> to vector<2xi8>
+// CHECK-DAG: [[UNDEF_:%.+]] = llvm.mlir.undef : vector<4xi8>
+// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK: [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
+// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[UNDEF_]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK-DAG: [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK: [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
+// CHECK: [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
+// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[VEC_1]] : vector<4xi8> to i32
+// CHECK: rocdl.cvt.scalef32.pk.f16.fp8 [[BITCAST]][false], %arg1 : vector<2xf16>
+func.func @scaled_ext_half_f8e4m3_f16(%v: vector<2xf8E4M3FN>, %scale: f32) -> vector<2xf16> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E4M3FN> to vector<2xf16>
+ func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f8e4m3_bf16
+// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E4M3FN> to vector<2xi8>
+// CHECK-DAG: [[UNDEF_:%.+]] = llvm.mlir.undef : vector<4xi8>
+// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK: [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
+// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[UNDEF_]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK-DAG: [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK: [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
+// CHECK: [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
+// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[VEC_1]] : vector<4xi8> to i32
+// CHECK: rocdl.cvt.scalef32.pk.bf16.fp8 [[BITCAST]][false], %arg1 : vector<2xbf16>
+func.func @scaled_ext_half_f8e4m3_bf16(%v: vector<2xf8E4M3FN>, %scale: f32) -> vector<2xbf16> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E4M3FN> to vector<2xbf16>
+ func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f8e4m3_f32
+// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E4M3FN> to vector<2xi8>
+// CHECK-DAG: [[UNDEF_:%.+]] = llvm.mlir.undef : vector<4xi8>
+// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK: [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
+// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[UNDEF_]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK-DAG: [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK: [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
+// CHECK: [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
+// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[VEC_1]] : vector<4xi8> to i32
+// CHECK: rocdl.cvt.scalef32.pk.f32.fp8 [[BITCAST]][false], %arg1 : vector<2xf32>
+func.func @scaled_ext_scalar_f8e4m3_f32(%v: vector<2xf8E4M3FN>, %scale: f32) -> vector<2xf32> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E4M3FN> to vector<2xf32>
+ func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f8e4m3_f16
+// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E4M3FN> to vector<2xi8>
+// CHECK-DAG: [[UNDEF_:%.+]] = llvm.mlir.undef : vector<4xi8>
+// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK: [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
+// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[UNDEF_]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK-DAG: [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK: [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
+// CHECK: [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
+// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[VEC_1]] : vector<4xi8> to i32
+// CHECK: rocdl.cvt.scalef32.pk.f16.fp8 [[BITCAST]][false], %arg1 : vector<2xf16>
+func.func @scaled_ext_scalar_f8e4m3_f16(%v: vector<2xf8E4M3FN>, %scale: f32) -> vector<2xf16> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E4M3FN> to vector<2xf16>
+ func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f8e4m3_bf16
+// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E4M3FN> to vector<2xi8>
+// CHECK-DAG: [[UNDEF_:%.+]] = llvm.mlir.undef : vector<4xi8>
+// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK: [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
+// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[UNDEF_]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK-DAG: [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK: [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
+// CHECK: [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
+// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[VEC_1]] : vector<4xi8> to i32
+// CHECK: rocdl.cvt.scalef32.pk.bf16.fp8 [[BITCAST]][false], %arg1 : vector<2xbf16>
+func.func @scaled_ext_scalar_f8e4m3_bf16(%v: vector<2xf8E4M3FN>, %scale: f32) -> vector<2xbf16> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E4M3FN> to vector<2xbf16>
+ func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f8e5m2_f32
+// CHECK-DAG: [[CAST:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<4xf8E5M2> to vector<4xi8>
+// CHECK-DAG: [[BITCAST:%.+]] = llvm.bitcast [[CAST]] : vector<4xi8> to i32
+// CHECK: rocdl.cvt.scalef32.pk.f32.bf8 [[BITCAST]][false], %arg1 : vector<2xf32>
+func.func @scaled_ext_full_f8e5m2_f32(%v: vector<4xf8E5M2>, %scale: f32) -> vector<2xf32> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf8E5M2> to vector<2xf32>
+ func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f8e5m2_f16
+// CHECK-DAG: [[CAST:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<4xf8E5M2> to vector<4xi8>
+// CHECK-DAG: [[BITCAST:%.+]] = llvm.bitcast [[CAST]] : vector<4xi8> to i32
+// CHECK: rocdl.cvt.scalef32.pk.f16.bf8 [[BITCAST]][false], %arg1 : vector<2xf16>
+func.func @scaled_ext_full_f8e5m2_f16(%v: vector<4xf8E5M2>, %scale: f32) -> vector<2xf16> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf8E5M2> to vector<2xf16>
+ func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f8e5m2_bf16
+// CHECK-DAG: [[CAST:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<4xf8E5M2> to vector<4xi8>
+// CHECK-DAG: [[BITCAST:%.+]] = llvm.bitcast [[CAST]] : vector<4xi8> to i32
+// CHECK: rocdl.cvt.scalef32.pk.bf16.bf8 [[BITCAST]][false], %arg1 : vector<2xbf16>
+func.func @scaled_ext_full_f8e5m2_bf16(%v: vector<4xf8E5M2>, %scale: f32) -> vector<2xbf16> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf8E5M2> to vector<2xbf16>
+ func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f8e5m2_f32
+// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E5M2> to vector<2xi8>
+// CHECK-DAG: [[UNDEF_:%.+]] = llvm.mlir.undef : vector<4xi8>
+// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK: [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
+// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[UNDEF_]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK-DAG: [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK: [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
+// CHECK: [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
+// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[VEC_1]] : vector<4xi8> to i32
+// CHECK: rocdl.cvt.scalef32.pk.f32.bf8 [[BITCAST]][false], %arg1 : vector<2xf32>
+func.func @scaled_ext_half_f8e5m2_f32(%v: vector<2xf8E5M2>, %scale: f32) -> vector<2xf32> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E5M2> to vector<2xf32>
+ func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f8e5m2_f16
+// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E5M2> to vector<2xi8>
+// CHECK-DAG: [[UNDEF_:%.+]] = llvm.mlir.undef : vector<4xi8>
+// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK: [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
+// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[UNDEF_]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK-DAG: [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK: [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
+// CHECK: [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
+// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[VEC_1]] : vector<4xi8> to i32
+// CHECK: rocdl.cvt.scalef32.pk.f16.bf8 [[BITCAST]][false], %arg1 : vector<2xf16>
+func.func @scaled_ext_half_f8e5m2_f16(%v: vector<2xf8E5M2>, %scale: f32) -> vector<2xf16> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E5M2> to vector<2xf16>
+ func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f8e5m2_bf16
+// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E5M2> to vector<2xi8>
+// CHECK-DAG: [[UNDEF_:%.+]] = llvm.mlir.undef : vector<4xi8>
+// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK: [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
+// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[UNDEF_]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK-DAG: [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK: [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
+// CHECK: [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
+// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[VEC_1]] : vector<4xi8> to i32
+// CHECK: rocdl.cvt.scalef32.pk.bf16.bf8 [[BITCAST]][false], %arg1 : vector<2xbf16>
+func.func @scaled_ext_half_f8e5m2_bf16(%v: vector<2xf8E5M2>, %scale: f32) -> vector<2xbf16> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E5M2> to vector<2xbf16>
+ func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f8e5m2_f32
+// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E5M2> to vector<2xi8>
+// CHECK-DAG: [[UNDEF_:%.+]] = llvm.mlir.undef : vector<4xi8>
+// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK: [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
+// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[UNDEF_]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK-DAG: [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK: [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
+// CHECK: [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
+// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[VEC_1]] : vector<4xi8> to i32
+// CHECK: rocdl.cvt.scalef32.pk.f32.bf8 [[BITCAST]][false], %arg1 : vector<2xf32>
+func.func @scaled_ext_scalar_f8e5m2_f32(%v: vector<2xf8E5M2>, %scale: f32) -> vector<2xf32> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E5M2> to vector<2xf32>
+ func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f8e5m2_f16
+// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E5M2> to vector<2xi8>
+// CHECK-DAG: [[UNDEF_:%.+]] = llvm.mlir.undef : vector<4xi8>
+// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK: [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
+// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[UNDEF_]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK-DAG: [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK: [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
+// CHECK: [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
+// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[VEC_1]] : vector<4xi8> to i32
+// CHECK: rocdl.cvt.scalef32.pk.f16.bf8 [[BITCAST]][false], %arg1 : vector<2xf16>
+func.func @scaled_ext_scalar_f8e5m2_f16(%v: vector<2xf8E5M2>, %scale: f32) -> vector<2xf16> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E5M2> to vector<2xf16>
+ func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f8e5m2_bf16
+// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E5M2> to vector<2xi8>
+// CHECK-DAG: [[UNDEF_:%.+]] = llvm.mlir.undef : vector<4xi8>
+// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK: [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
+// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[UNDEF_]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK-DAG: [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK: [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
+// CHECK: [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
+// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[VEC_1]] : vector<4xi8> to i32
+// CHECK: rocdl.cvt.scalef32.pk.bf16.bf8 [[BITCAST]][false], %arg1 : vector<2xbf16>
+func.func @scaled_ext_scalar_f8e5m2_bf16(%v: vector<2xf8E5M2>, %scale: f32) -> vector<2xbf16> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E5M2> to vector<2xbf16>
+ func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f4e2m1_f32
+// CHECK-DAG: [[CAST:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<8xf4E2M1FN> to vector<8xi4>
+// CHECK-DAG: [[BITCAST:%.+]] = llvm.bitcast [[CAST]] : vector<8xi4> to i32
+// CHECK: rocdl.cvt.scalef32.pk.f32.fp4 [[BITCAST]][0], %arg1 : vector<2xf32>
+func.func @scaled_ext_full_f4e2m1_f32(%v: vector<8xf4E2M1FN>, %scale: f32) -> vector<2xf32> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<8xf4E2M1FN> to vector<2xf32>
+ func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f4e2m1_f16
+// CHECK-DAG: [[CAST:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<8xf4E2M1FN> to vector<8xi4>
+// CHECK-DAG: [[BITCAST:%.+]] = llvm.bitcast [[CAST]] : vector<8xi4> to i32
+// CHECK: rocdl.cvt.scalef32.pk.f16.fp4 [[BITCAST]][0], %arg1 : vector<2xf16>
+func.func @scaled_ext_full_f4e2m1_f16(%v: vector<8xf4E2M1FN>, %scale: f32) -> vector<2xf16> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<8xf4E2M1FN> to vector<2xf16>
+ func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f4e2m1_bf16
+// CHECK-DAG: [[CAST:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<8xf4E2M1FN> to vector<8xi4>
+// CHECK-DAG: [[BITCAST:%.+]] = llvm.bitcast [[CAST]] : vector<8xi4> to i32
+// CHECK: rocdl.cvt.scalef32.pk.bf16.fp4 [[BITCAST]][0], %arg1 : vector<2xbf16>
+func.func @scaled_ext_full_f4e2m1_bf16(%v: vector<8xf4E2M1FN>, %scale: f32) -> vector<2xbf16> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<8xf4E2M1FN> to vector<2xbf16>
+ func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f4e2m1_f32
+// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<4xf4E2M1FN> to vector<4xi4>
+// CHECK-DAG: [[UNDEF_:%.+]] = llvm.mlir.undef : vector<8xi4>
+// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK: [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<4xi4>
+// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[UNDEF_]]{{\[}}[[C0]] : i32] : vector<8xi4>
+// CHECK-DAG: [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK: [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<4xi4>
+// CHECK: [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<8xi4>
+// CHECK-DAG: [[C2:%.+]] = llvm.mlir.constant(2 : i32) : i32
+// CHECK: [[ELEM_2:%.+]] = llvm.extractelement [[V]]{{\[}}[[C2]] : i32] : vector<4xi4>
+// CHECK: [[VEC_2:%.+]] = llvm.insertelement [[ELEM_2]], [[VEC_1]]{{\[}}[[C2]] : i32] : vector<8xi4>
+// CHECK-DAG: [[C3:%.+]] = llvm.mlir.constant(3 : i32) : i32
+// CHECK: [[ELEM_3:%.+]] = llvm.extractelement [[V]]{{\[}}[[C3]] : i32] : vector<4xi4>
+// CHECK: [[VEC_3:%.+]] = llvm.insertelement [[ELEM_3]], [[VEC_2]]{{\[}}[[C3]] : i32] : vector<8xi4>
+// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[VEC_3]] : vector<8xi4> to i32
+// CHECK: rocdl.cvt.scalef32.pk.f32.fp4 [[BITCAST]][0], %arg1 : vector<2xf32>
+func.func @scaled_ext_half_f4e2m1_f32(%v: vector<4xf4E2M1FN>, %scale: f32) -> vector<2xf32> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf4E2M1FN> to vector<2xf32>
+  func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f4e2m1_f16
+// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<4xf4E2M1FN> to vector<4xi4>
+// CHECK-DAG: [[UNDEF_:%.+]] = llvm.mlir.undef : vector<8xi4>
+// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK: [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<4xi4>
+// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[UNDEF_]]{{\[}}[[C0]] : i32] : vector<8xi4>
+// CHECK-DAG: [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK: [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<4xi4>
+// CHECK: [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<8xi4>
+// CHECK-DAG: [[C2:%.+]] = llvm.mlir.constant(2 : i32) : i32
+// CHECK: [[ELEM_2:%.+]] = llvm.extractelement [[V]]{{\[}}[[C2]] : i32] : vector<4xi4>
+// CHECK: [[VEC_2:%.+]] = llvm.insertelement [[ELEM_2]], [[VEC_1]]{{\[}}[[C2]] : i32] : vector<8xi4>
+// CHECK-DAG: [[C3:%.+]] = llvm.mlir.constant(3 : i32) : i32
+// CHECK: [[ELEM_3:%.+]] = llvm.extractelement [[V]]{{\[}}[[C3]] : i32] : vector<4xi4>
+// CHECK: [[VEC_3:%.+]] = llvm.insertelement [[ELEM_3]], [[VEC_2]]{{\[}}[[C3]] : i32] : vector<8xi4>
+// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[VEC_3]] : vector<8xi4> to i32
+// CHECK: rocdl.cvt.scalef32.pk.f16.fp4 [[BITCAST]][0], %arg1 : vector<2xf16>
+func.func @scaled_ext_half_f4e2m1_f16(%v: vector<4xf4E2M1FN>, %scale: f32) -> vector<2xf16> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf4E2M1FN> to vector<2xf16>
+ func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f4e2m1_bf16
+// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<4xf4E2M1FN> to vector<4xi4>
+// CHECK-DAG: [[UNDEF_:%.+]] = llvm.mlir.undef : vector<8xi4>
+// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK: [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<4xi4>
+// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[UNDEF_]]{{\[}}[[C0]] : i32] : vector<8xi4>
+// CHECK-DAG: [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK: [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<4xi4>
+// CHECK: [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<8xi4>
+// CHECK-DAG: [[C2:%.+]] = llvm.mlir.constant(2 : i32) : i32
+// CHECK: [[ELEM_2:%.+]] = llvm.extractelement [[V]]{{\[}}[[C2]] : i32] : vector<4xi4>
+// CHECK: [[VEC_2:%.+]] = llvm.insertelement [[ELEM_2]], [[VEC_1]]{{\[}}[[C2]] : i32] : vector<8xi4>
+// CHECK-DAG: [[C3:%.+]] = llvm.mlir.constant(3 : i32) : i32
+// CHECK: [[ELEM_3:%.+]] = llvm.extractelement [[V]]{{\[}}[[C3]] : i32] : vector<4xi4>
+// CHECK: [[VEC_3:%.+]] = llvm.insertelement [[ELEM_3]], [[VEC_2]]{{\[}}[[C3]] : i32] : vector<8xi4>
+// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[VEC_3]] : vector<8xi4> to i32
+// CHECK: rocdl.cvt.scalef32.pk.bf16.fp4 [[BITCAST]][0], %arg1 : vector<2xbf16>
+func.func @scaled_ext_half_f4e2m1_bf16(%v: vector<4xf4E2M1FN>, %scale: f32) -> vector<2xbf16> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf4E2M1FN> to vector<2xbf16>
+ func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f4e2m1_f32
+// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf4E2M1FN> to vector<2xi4>
+// CHECK-DAG: [[UNDEF_:%.+]] = llvm.mlir.undef : vector<8xi4>
+// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK: [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi4>
+// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[UNDEF_]]{{\[}}[[C0]] : i32] : vector<8xi4>
+// CHECK-DAG: [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK: [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi4>
+// CHECK: [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<8xi4>
+// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[VEC_1]] : vector<8xi4> to i32
+// CHECK: rocdl.cvt.scalef32.pk.f32.fp4 [[BITCAST]][0], %arg1 : vector<2xf32>
+func.func @scaled_ext_scalar_f4e2m1_f32(%v: vector<2xf4E2M1FN>, %scale: f32) -> vector<2xf32> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf4E2M1FN> to vector<2xf32>
+ func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f4e2m1_f16
+// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf4E2M1FN> to vector<2xi4>
+// CHECK-DAG: [[UNDEF_:%.+]] = llvm.mlir.undef : vector<8xi4>
+// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK: [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi4>
+// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[UNDEF_]]{{\[}}[[C0]] : i32] : vector<8xi4>
+// CHECK-DAG: [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK: [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi4>
+// CHECK: [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<8xi4>
+// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[VEC_1]] : vector<8xi4> to i32
+// CHECK: rocdl.cvt.scalef32.pk.f16.fp4 [[BITCAST]][0], %arg1 : vector<2xf16>
+func.func @scaled_ext_scalar_f4e2m1_f16(%v: vector<2xf4E2M1FN>, %scale: f32) -> vector<2xf16> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf4E2M1FN> to vector<2xf16>
+ func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f4e2m1_bf16
+// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf4E2M1FN> to vector<2xi4>
+// CHECK-DAG: [[UNDEF_:%.+]] = llvm.mlir.undef : vector<8xi4>
+// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK: [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi4>
+// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[UNDEF_]]{{\[}}[[C0]] : i32] : vector<8xi4>
+// CHECK-DAG: [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK: [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi4>
+// CHECK: [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<8xi4>
+// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[VEC_1]] : vector<8xi4> to i32
+// CHECK: rocdl.cvt.scalef32.pk.bf16.fp4 [[BITCAST]][0], %arg1 : vector<2xbf16>
+func.func @scaled_ext_scalar_f4e2m1_bf16(%v: vector<2xf4E2M1FN>, %scale: f32) -> vector<2xbf16> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf4E2M1FN> to vector<2xbf16>
+ func.return %ret : vector<2xbf16>
+}
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/packed-trunc.mlir b/mlir/test/Conversion/AMDGPUToROCDL/packed-trunc.mlir
new file mode 100644
index 0000000000000..b90f485457b2d
--- /dev/null
+++ b/mlir/test/Conversion/AMDGPUToROCDL/packed-trunc.mlir
@@ -0,0 +1,232 @@
+// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx950 | FileCheck %s
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e4m3_f32
+// CHECK-DAG: [[UNDEF:%.+]] = llvm.mlir.undef : vector<2xi16>
+// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK-DAG: [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK: [[ELEM0:%.+]] = llvm.extractelement %arg0{{\[}}[[C0]] : i32] : vector<2xf32>
+// CHECK: [[ELEM1:%.+]] = llvm.extractelement %arg0{{\[}}[[C1]] : i32] : vector<2xf32>
+// CHECK: [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp8.f32 [[ELEM0]], [[ELEM1]], %arg1 -> [[UNDEF]][false] : vector<2xi16>
+// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK: [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E4M3FN>
+// CHECK: return [[CAST]] : vector<4xf8E4M3FN>
+func.func @packed_scaled_trunc_f8e4m3_f32(%v: vector<2xf32>, %scale: f32) -> vector<4xf8E4M3FN> {
+ %ret = amdgpu.packed_scaled_trunc %v into undef[index 0], %scale : vector<2xf32> to vector<4xf8E4M3FN>
+ func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e4m3_f32
+// CHECK-DAG: [[EXISTING_CAST_TO_I8:%.+]] = builtin.unrealized_conversion_cast %arg1 : vector<4xf8E4M3FN> to vector<4xi8>
+// CHECK-DAG: [[EXISTING_BITCAST_TO_I16:%.+]] = llvm.bitcast [[EXISTING_CAST_TO_I8]] : vector<4xi8> to vector<2xi16>
+// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK-DAG: [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK: [[ELEM0:%.+]] = llvm.extractelement %arg0{{\[}}[[C0]] : i32] : vector<2xf32>
+// CHECK: [[ELEM1:%.+]] = llvm.extractelement %arg0{{\[}}[[C1]] : i32] : vector<2xf32>
+// CHECK: [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp8.f32 [[ELEM0]], [[ELEM1]], %arg2 -> [[EXISTING_BITCAST_TO_I16]][false] : vector<2xi16>
+// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK: [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E4M3FN>
+// CHECK: return [[CAST]] : vector<4xf8E4M3FN>
+func.func @packed_scaled_trunc_into_f8e4m3_f32(%v: vector<2xf32>, %existing: vector<4xf8E4M3FN>, %scale: f32) -> vector<4xf8E4M3FN> {
+ %ret = amdgpu.packed_scaled_trunc %v into %existing[index 0], %scale : vector<2xf32> to vector<4xf8E4M3FN> into vector<4xf8E4M3FN>
+ func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e4m3_f16
+// CHECK-DAG: [[UNDEF:%.+]] = llvm.mlir.undef : vector<2xi16>
+// CHECK: [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp8.f16 %arg0, %arg1 -> [[UNDEF]][false] : vector<2xi16>
+// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK: [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E4M3FN>
+// CHECK: return [[CAST]] : vector<4xf8E4M3FN>
+func.func @packed_scaled_trunc_f8e4m3_f16(%v: vector<2xf16>, %scale: f32) -> vector<4xf8E4M3FN> {
+ %ret = amdgpu.packed_scaled_trunc %v into undef[index 0], %scale : vector<2xf16> to vector<4xf8E4M3FN>
+ func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e4m3_f16
+// CHECK-DAG: [[EXISTING_CAST_TO_I8:%.+]] = builtin.unrealized_conversion_cast %arg1 : vector<4xf8E4M3FN> to vector<4xi8>
+// CHECK-DAG: [[EXISTING_BITCAST_TO_I16:%.+]] = llvm.bitcast [[EXISTING_CAST_TO_I8]] : vector<4xi8> to vector<2xi16>
+// CHECK: [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp8.f16 %arg0, %arg2 -> [[EXISTING_BITCAST_TO_I16]][false] : vector<2xi16>
+// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK: [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E4M3FN>
+// CHECK: return [[CAST]] : vector<4xf8E4M3FN>
+func.func @packed_scaled_trunc_into_f8e4m3_f16(%v: vector<2xf16>, %existing: vector<4xf8E4M3FN>, %scale: f32) -> vector<4xf8E4M3FN> {
+ %ret = amdgpu.packed_scaled_trunc %v into %existing[index 0], %scale : vector<2xf16> to vector<4xf8E4M3FN> into vector<4xf8E4M3FN>
+ func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e4m3_bf16
+// CHECK-DAG: [[UNDEF:%.+]] = llvm.mlir.undef : vector<2xi16>
+// CHECK: [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp8.bf16 %arg0, %arg1 -> [[UNDEF]][false] : vector<2xi16>
+// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK: [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E4M3FN>
+// CHECK: return [[CAST]] : vector<4xf8E4M3FN>
+func.func @packed_scaled_trunc_f8e4m3_bf16(%v: vector<2xbf16>, %scale: f32) -> vector<4xf8E4M3FN> {
+ %ret = amdgpu.packed_scaled_trunc %v into undef[index 0], %scale : vector<2xbf16> to vector<4xf8E4M3FN>
+ func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e4m3_bf16
+// CHECK-DAG: [[EXISTING_CAST_TO_I8:%.+]] = builtin.unrealized_conversion_cast %arg1 : vector<4xf8E4M3FN> to vector<4xi8>
+// CHECK-DAG: [[EXISTING_BITCAST_TO_I16:%.+]] = llvm.bitcast [[EXISTING_CAST_TO_I8]] : vector<4xi8> to vector<2xi16>
+// CHECK: [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp8.bf16 %arg0, %arg2 -> [[EXISTING_BITCAST_TO_I16]][false] : vector<2xi16>
+// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK: [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E4M3FN>
+// CHECK: return [[CAST]] : vector<4xf8E4M3FN>
+func.func @packed_scaled_trunc_into_f8e4m3_bf16(%v: vector<2xbf16>, %existing: vector<4xf8E4M3FN>, %scale: f32) -> vector<4xf8E4M3FN> {
+ %ret = amdgpu.packed_scaled_trunc %v into %existing[index 0], %scale : vector<2xbf16> to vector<4xf8E4M3FN> into vector<4xf8E4M3FN>
+ func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e5m2_f32
+// CHECK-DAG: [[UNDEF:%.+]] = llvm.mlir.undef : vector<2xi16>
+// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK-DAG: [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK: [[ELEM0:%.+]] = llvm.extractelement %arg0{{\[}}[[C0]] : i32] : vector<2xf32>
+// CHECK: [[ELEM1:%.+]] = llvm.extractelement %arg0{{\[}}[[C1]] : i32] : vector<2xf32>
+// CHECK: [[CVT:%.+]] = rocdl.cvt.scalef32.pk.bf8.f32 [[ELEM0]], [[ELEM1]], %arg1 -> [[UNDEF]][false] : vector<2xi16>
+// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK: [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E5M2>
+// CHECK: return [[CAST]] : vector<4xf8E5M2>
+func.func @packed_scaled_trunc_f8e5m2_f32(%v: vector<2xf32>, %scale: f32) -> vector<4xf8E5M2> {
+ %ret = amdgpu.packed_scaled_trunc %v into undef[index 0], %scale : vector<2xf32> to vector<4xf8E5M2>
+ func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e5m2_f32
+// CHECK-DAG: [[EXISTING_CAST_TO_I8:%.+]] = builtin.unrealized_conversion_cast %arg1 : vector<4xf8E5M2> to vector<4xi8>
+// CHECK-DAG: [[EXISTING_BITCAST_TO_I16:%.+]] = llvm.bitcast [[EXISTING_CAST_TO_I8]] : vector<4xi8> to vector<2xi16>
+// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK-DAG: [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK: [[ELEM0:%.+]] = llvm.extractelement %arg0{{\[}}[[C0]] : i32] : vector<2xf32>
+// CHECK: [[ELEM1:%.+]] = llvm.extractelement %arg0{{\[}}[[C1]] : i32] : vector<2xf32>
+// CHECK: [[CVT:%.+]] = rocdl.cvt.scalef32.pk.bf8.f32 [[ELEM0]], [[ELEM1]], %arg2 -> [[EXISTING_BITCAST_TO_I16]][false] : vector<2xi16>
+// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK: [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E5M2>
+// CHECK: return [[CAST]] : vector<4xf8E5M2>
+func.func @packed_scaled_trunc_into_f8e5m2_f32(%v: vector<2xf32>, %existing: vector<4xf8E5M2>, %scale: f32) -> vector<4xf8E5M2> {
+ %ret = amdgpu.packed_scaled_trunc %v into %existing[index 0], %scale : vector<2xf32> to vector<4xf8E5M2> into vector<4xf8E5M2>
+ func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e5m2_f16
+// CHECK-DAG: [[UNDEF:%.+]] = llvm.mlir.undef : vector<2xi16>
+// CHECK: [[CVT:%.+]] = rocdl.cvt.scalef32.pk.bf8.f16 %arg0, %arg1 -> [[UNDEF]][false] : vector<2xi16>
+// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK: [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E5M2>
+// CHECK: return [[CAST]] : vector<4xf8E5M2>
+func.func @packed_scaled_trunc_f8e5m2_f16(%v: vector<2xf16>, %scale: f32) -> vector<4xf8E5M2> {
+ %ret = amdgpu.packed_scaled_trunc %v into undef[index 0], %scale : vector<2xf16> to vector<4xf8E5M2>
+ func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e5m2_f16
+// CHECK-DAG: [[EXISTING_CAST_TO_I8:%.+]] = builtin.unrealized_conversion_cast %arg1 : vector<4xf8E5M2> to vector<4xi8>
+// CHECK-DAG: [[EXISTING_BITCAST_TO_I16:%.+]] = llvm.bitcast [[EXISTING_CAST_TO_I8]] : vector<4xi8> to vector<2xi16>
+// CHECK: [[CVT:%.+]] = rocdl.cvt.scalef32.pk.bf8.f16 %arg0, %arg2 -> [[EXISTING_BITCAST_TO_I16]][false] : vector<2xi16>
+// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK: [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E5M2>
+// CHECK: return [[CAST]] : vector<4xf8E5M2>
+func.func @packed_scaled_trunc_into_f8e5m2_f16(%v: vector<2xf16>, %existing: vector<4xf8E5M2>, %scale: f32) -> vector<4xf8E5M2> {
+ %ret = amdgpu.packed_scaled_trunc %v into %existing[index 0], %scale : vector<2xf16> to vector<4xf8E5M2> into vector<4xf8E5M2>
+ func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e5m2_bf16
+// CHECK-DAG: [[UNDEF:%.+]] = llvm.mlir.undef : vector<2xi16>
+// CHECK: [[CVT:%.+]] = rocdl.cvt.scalef32.pk.bf8.bf16 %arg0, %arg1 -> [[UNDEF]][false] : vector<2xi16>
+// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK: [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E5M2>
+// CHECK: return [[CAST]] : vector<4xf8E5M2>
+func.func @packed_scaled_trunc_f8e5m2_bf16(%v: vector<2xbf16>, %scale: f32) -> vector<4xf8E5M2> {
+ %ret = amdgpu.packed_scaled_trunc %v into undef[index 0], %scale : vector<2xbf16> to vector<4xf8E5M2>
+ func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e5m2_bf16
+// CHECK-DAG: [[EXISTING_CAST_TO_I8:%.+]] = builtin.unrealized_conversion_cast %arg1 : vector<4xf8E5M2> to vector<4xi8>
+// CHECK-DAG: [[EXISTING_BITCAST_TO_I16:%.+]] = llvm.bitcast [[EXISTING_CAST_TO_I8]] : vector<4xi8> to vector<2xi16>
+// CHECK: [[CVT:%.+]] = rocdl.cvt.scalef32.pk.bf8.bf16 %arg0, %arg2 -> [[EXISTING_BITCAST_TO_I16]][false] : vector<2xi16>
+// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK: [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E5M2>
+// CHECK: return [[CAST]] : vector<4xf8E5M2>
+func.func @packed_scaled_trunc_into_f8e5m2_bf16(%v: vector<2xbf16>, %existing: vector<4xf8E5M2>, %scale: f32) -> vector<4xf8E5M2> {
+ %ret = amdgpu.packed_scaled_trunc %v into %existing[index 0], %scale : vector<2xbf16> to vector<4xf8E5M2> into vector<4xf8E5M2>
+ func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f4e2m1_f32
+// CHECK-DAG: [[UNDEF:%.+]] = llvm.mlir.undef : i32
+// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK-DAG: [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK: [[ELEM0:%.+]] = llvm.extractelement %arg0{{\[}}[[C0]] : i32] : vector<2xf32>
+// CHECK: [[ELEM1:%.+]] = llvm.extractelement %arg0{{\[}}[[C1]] : i32] : vector<2xf32>
+// CHECK: [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp4.f32 [[ELEM0]], [[ELEM1]], %arg1 -> [[UNDEF]][0] : i32
+// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : i32 to vector<8xi4>
+// CHECK: [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<8xi4> to vector<8xf4E2M1FN>
+// CHECK: return [[CAST]] : vector<8xf4E2M1FN>
+func.func @packed_scaled_trunc_f4e2m1_f32(%v: vector<2xf32>, %scale: f32) -> vector<8xf4E2M1FN> {
+ %ret = amdgpu.packed_scaled_trunc %v into undef[index 0], %scale : vector<2xf32> to vector<8xf4E2M1FN>
+ func.return %ret : vector<8xf4E2M1FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f4e2m1_f32
+// CHECK-DAG: [[BITCAST_I4:%.+]] = builtin.unrealized_conversion_cast %arg1 : vector<8xf4E2M1FN> to vector<8xi4>
+// CHECK-DAG: [[BITCAST_I32:%.+]] = llvm.bitcast [[BITCAST_I4]] : vector<8xi4> to i32
+// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK-DAG: [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK: [[ELEM0:%.+]] = llvm.extractelement %arg0{{\[}}[[C0]] : i32] : vector<2xf32>
+// CHECK: [[ELEM1:%.+]] = llvm.extractelement %arg0{{\[}}[[C1]] : i32] : vector<2xf32>
+// CHECK: [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp4.f32 [[ELEM0]], [[ELEM1]], %arg2 -> [[BITCAST_I32]][0] : i32
+// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : i32 to vector<8xi4>
+// CHECK: [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<8xi4> to vector<8xf4E2M1FN>
+// CHECK: return [[CAST]] : vector<8xf4E2M1FN>
+func.func @packed_scaled_trunc_into_f4e2m1_f32(%v: vector<2xf32>, %existing: vector<8xf4E2M1FN>, %scale: f32) -> vector<8xf4E2M1FN> {
+ %ret = amdgpu.packed_scaled_trunc %v into %existing[index 0], %scale : vector<2xf32> to vector<8xf4E2M1FN> into vector<8xf4E2M1FN>
+ func.return %ret : vector<8xf4E2M1FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f4e2m1_f16
+// CHECK-DAG: [[UNDEF:%.+]] = llvm.mlir.undef : i32
+// CHECK: [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp4.f16 %arg0, %arg1 -> [[UNDEF]][0] : i32
+// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : i32 to vector<8xi4>
+// CHECK: [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<8xi4> to vector<8xf4E2M1FN>
+// CHECK: return [[CAST]] : vector<8xf4E2M1FN>
+func.func @packed_scaled_trunc_f4e2m1_f16(%v: vector<2xf16>, %scale: f32) -> vector<8xf4E2M1FN> {
+ %ret = amdgpu.packed_scaled_trunc %v into undef[index 0], %scale : vector<2xf16> to vector<8xf4E2M1FN>
+ func.return %ret : vector<8xf4E2M1FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f4e2m1_f16
+// CHECK-DAG: [[BITCAST_I4:%.+]] = builtin.unrealized_conversion_cast %arg1 : vector<8xf4E2M1FN> to vector<8xi4>
+// CHECK-DAG: [[BITCAST_I32:%.+]] = llvm.bitcast [[BITCAST_I4]] : vector<8xi4> to i32
+// CHECK: [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp4.f16 %arg0, %arg2 -> [[BITCAST_I32]][0] : i32
+// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : i32 to vector<8xi4>
+// CHECK: [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<8xi4> to vector<8xf4E2M1FN>
+// CHECK: return [[CAST]] : vector<8xf4E2M1FN>
+func.func @packed_scaled_trunc_into_f4e2m1_f16(%v: vector<2xf16>, %existing: vector<8xf4E2M1FN>, %scale: f32) -> vector<8xf4E2M1FN> {
+ %ret = amdgpu.packed_scaled_trunc %v into %existing[index 0], %scale : vector<2xf16> to vector<8xf4E2M1FN> into vector<8xf4E2M1FN>
+ func.return %ret : vector<8xf4E2M1FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f4e2m1_bf16
+// CHECK-DAG: [[UNDEF:%.+]] = llvm.mlir.undef : i32
+// CHECK: [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp4.bf16 %arg0, %arg1 -> [[UNDEF]][0] : i32
+// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : i32 to vector<8xi4>
+// CHECK: [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<8xi4> to vector<8xf4E2M1FN>
+// CHECK: return [[CAST]] : vector<8xf4E2M1FN>
+func.func @packed_scaled_trunc_f4e2m1_bf16(%v: vector<2xbf16>, %scale: f32) -> vector<8xf4E2M1FN> {
+ %ret = amdgpu.packed_scaled_trunc %v into undef[index 0], %scale : vector<2xbf16> to vector<8xf4E2M1FN>
+ func.return %ret : vector<8xf4E2M1FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f4e2m1_bf16
+// CHECK-DAG: [[BITCAST_I4:%.+]] = builtin.unrealized_conversion_cast %arg1 : vector<8xf4E2M1FN> to vector<8xi4>
+// CHECK-DAG: [[BITCAST_I32:%.+]] = llvm.bitcast [[BITCAST_I4]] : vector<8xi4> to i32
+// CHECK: [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp4.bf16 %arg0, %arg2 -> [[BITCAST_I32]][0] : i32
+// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : i32 to vector<8xi4>
+// CHECK: [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<8xi4> to vector<8xf4E2M1FN>
+// CHECK: return [[CAST]] : vector<8xf4E2M1FN>
+func.func @packed_scaled_trunc_into_f4e2m1_bf16(%v: vector<2xbf16>, %existing: vector<8xf4E2M1FN>, %scale: f32) -> vector<8xf4E2M1FN> {
+ %ret = amdgpu.packed_scaled_trunc %v into %existing[index 0], %scale : vector<2xbf16> to vector<8xf4E2M1FN> into vector<8xf4E2M1FN>
+ func.return %ret : vector<8xf4E2M1FN>
+}
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 188cfcc4eb38b..354ad19e41931 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -32,6 +32,321 @@ func.func @packed_stoch_round_fp8(%v1: f32, %stoch: i32, %others: vector<4xf8E5M
func.return %ret : vector<4xf8E5M2FNUZ>
}
+// CHECK-LABEL: func.func @scaled_ext_full_f8e4m3_f32
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_full_f8e4m3_f32(%v: vector<4xf8E4M3FN>, %scale: f32) -> vector<2xf32> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf8E4M3FN> to vector<2xf32>
+ func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f8e4m3_f16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_full_f8e4m3_f16(%v: vector<4xf8E4M3FN>, %scale: f32) -> vector<2xf16> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf8E4M3FN> to vector<2xf16>
+ func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f8e4m3_bf16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_full_f8e4m3_bf16(%v: vector<4xf8E4M3FN>, %scale: f32) -> vector<2xbf16> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf8E4M3FN> to vector<2xbf16>
+ func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f8e4m3_f32
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_half_f8e4m3_f32(%v: vector<2xf8E4M3FN>, %scale: f32) -> vector<2xf32> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E4M3FN> to vector<2xf32>
+ func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f8e4m3_f16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_half_f8e4m3_f16(%v: vector<2xf8E4M3FN>, %scale: f32) -> vector<2xf16> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E4M3FN> to vector<2xf16>
+ func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f8e4m3_bf16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_half_f8e4m3_bf16(%v: vector<2xf8E4M3FN>, %scale: f32) -> vector<2xbf16> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E4M3FN> to vector<2xbf16>
+ func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f8e4m3_f32
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_scalar_f8e4m3_f32(%v: vector<2xf8E4M3FN>, %scale: f32) -> vector<2xf32> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E4M3FN> to vector<2xf32>
+ func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f8e4m3_f16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_scalar_f8e4m3_f16(%v: vector<2xf8E4M3FN>, %scale: f32) -> vector<2xf16> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E4M3FN> to vector<2xf16>
+ func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f8e4m3_bf16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_scalar_f8e4m3_bf16(%v: vector<2xf8E4M3FN>, %scale: f32) -> vector<2xbf16> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E4M3FN> to vector<2xbf16>
+ func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f8e5m2_f32
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_full_f8e5m2_f32(%v: vector<4xf8E5M2>, %scale: f32) -> vector<2xf32> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf8E5M2> to vector<2xf32>
+ func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f8e5m2_f16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_full_f8e5m2_f16(%v: vector<4xf8E5M2>, %scale: f32) -> vector<2xf16> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf8E5M2> to vector<2xf16>
+ func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f8e5m2_bf16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_full_f8e5m2_bf16(%v: vector<4xf8E5M2>, %scale: f32) -> vector<2xbf16> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf8E5M2> to vector<2xbf16>
+ func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f8e5m2_f32
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_half_f8e5m2_f32(%v: vector<2xf8E5M2>, %scale: f32) -> vector<2xf32> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E5M2> to vector<2xf32>
+ func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f8e5m2_f16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_half_f8e5m2_f16(%v: vector<2xf8E5M2>, %scale: f32) -> vector<2xf16> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E5M2> to vector<2xf16>
+ func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f8e5m2_bf16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_half_f8e5m2_bf16(%v: vector<2xf8E5M2>, %scale: f32) -> vector<2xbf16> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E5M2> to vector<2xbf16>
+ func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f8e5m2_f32
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_scalar_f8e5m2_f32(%v: vector<2xf8E5M2>, %scale: f32) -> vector<2xf32> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E5M2> to vector<2xf32>
+ func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f8e5m2_f16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_scalar_f8e5m2_f16(%v: vector<2xf8E5M2>, %scale: f32) -> vector<2xf16> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E5M2> to vector<2xf16>
+ func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f8e5m2_bf16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_scalar_f8e5m2_bf16(%v: vector<2xf8E5M2>, %scale: f32) -> vector<2xbf16> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E5M2> to vector<2xbf16>
+ func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f4e2m1_f32
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_full_f4e2m1_f32(%v: vector<8xf4E2M1FN>, %scale: f32) -> vector<2xf32> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<8xf4E2M1FN> to vector<2xf32>
+ func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f4e2m1_f16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_full_f4e2m1_f16(%v: vector<8xf4E2M1FN>, %scale: f32) -> vector<2xf16> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<8xf4E2M1FN> to vector<2xf16>
+ func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f4e2m1_bf16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_full_f4e2m1_bf16(%v: vector<8xf4E2M1FN>, %scale: f32) -> vector<2xbf16> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<8xf4E2M1FN> to vector<2xbf16>
+ func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f4e2m1_f32
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_half_f4e2m1_f32(%v: vector<4xf4E2M1FN>, %scale: f32) -> vector<2xf32> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf4E2M1FN> to vector<2xf32>
+  func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f4e2m1_f16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_half_f4e2m1_f16(%v: vector<4xf4E2M1FN>, %scale: f32) -> vector<2xf16> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf4E2M1FN> to vector<2xf16>
+ func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f4e2m1_bf16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_half_f4e2m1_bf16(%v: vector<4xf4E2M1FN>, %scale: f32) -> vector<2xbf16> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf4E2M1FN> to vector<2xbf16>
+ func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f4e2m1_f32
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_scalar_f4e2m1_f32(%v: vector<2xf4E2M1FN>, %scale: f32) -> vector<2xf32> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf4E2M1FN> to vector<2xf32>
+ func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f4e2m1_f16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_scalar_f4e2m1_f16(%v: vector<2xf4E2M1FN>, %scale: f32) -> vector<2xf16> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf4E2M1FN> to vector<2xf16>
+ func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f4e2m1_bf16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_scalar_f4e2m1_bf16(%v: vector<2xf4E2M1FN>, %scale: f32) -> vector<2xbf16> {
+ %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf4E2M1FN> to vector<2xbf16>
+ func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e4m3_f32
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_f8e4m3_f32(%v: vector<2xf32>, %scale: f32) -> vector<4xf8E4M3FN> {
+ %ret = amdgpu.packed_scaled_trunc %v into undef[index 0], %scale : vector<2xf32> to vector<4xf8E4M3FN>
+ func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e4m3_f32
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_into_f8e4m3_f32(%v: vector<2xf32>, %existing: vector<4xf8E4M3FN>, %scale: f32) -> vector<4xf8E4M3FN> {
+ %ret = amdgpu.packed_scaled_trunc %v into %existing[index 0], %scale : vector<2xf32> to vector<4xf8E4M3FN> into vector<4xf8E4M3FN>
+ func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e4m3_f16
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_f8e4m3_f16(%v: vector<2xf16>, %scale: f32) -> vector<4xf8E4M3FN> {
+ %ret = amdgpu.packed_scaled_trunc %v into undef[index 0], %scale : vector<2xf16> to vector<4xf8E4M3FN>
+ func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e4m3_f16
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_into_f8e4m3_f16(%v: vector<2xf16>, %existing: vector<4xf8E4M3FN>, %scale: f32) -> vector<4xf8E4M3FN> {
+ %ret = amdgpu.packed_scaled_trunc %v into %existing[index 0], %scale : vector<2xf16> to vector<4xf8E4M3FN> into vector<4xf8E4M3FN>
+ func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e4m3_bf16
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_f8e4m3_bf16(%v: vector<2xbf16>, %scale: f32) -> vector<4xf8E4M3FN> {
+ %ret = amdgpu.packed_scaled_trunc %v into undef[index 0], %scale : vector<2xbf16> to vector<4xf8E4M3FN>
+ func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e4m3_bf16
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_into_f8e4m3_bf16(%v: vector<2xbf16>, %existing: vector<4xf8E4M3FN>, %scale: f32) -> vector<4xf8E4M3FN> {
+ %ret = amdgpu.packed_scaled_trunc %v into %existing[index 0], %scale : vector<2xbf16> to vector<4xf8E4M3FN> into vector<4xf8E4M3FN>
+ func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e5m2_f32
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_f8e5m2_f32(%v: vector<2xf32>, %scale: f32) -> vector<4xf8E5M2> {
+ %ret = amdgpu.packed_scaled_trunc %v into undef[index 0], %scale : vector<2xf32> to vector<4xf8E5M2>
+ func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e5m2_f32
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_into_f8e5m2_f32(%v: vector<2xf32>, %existing: vector<4xf8E5M2>, %scale: f32) -> vector<4xf8E5M2> {
+ %ret = amdgpu.packed_scaled_trunc %v into %existing[index 0], %scale : vector<2xf32> to vector<4xf8E5M2> into vector<4xf8E5M2>
+ func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e5m2_f16
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_f8e5m2_f16(%v: vector<2xf16>, %scale: f32) -> vector<4xf8E5M2> {
+ %ret = amdgpu.packed_scaled_trunc %v into undef[index 0], %scale : vector<2xf16> to vector<4xf8E5M2>
+ func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e5m2_f16
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_into_f8e5m2_f16(%v: vector<2xf16>, %existing: vector<4xf8E5M2>, %scale: f32) -> vector<4xf8E5M2> {
+ %ret = amdgpu.packed_scaled_trunc %v into %existing[index 0], %scale : vector<2xf16> to vector<4xf8E5M2> into vector<4xf8E5M2>
+ func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e5m2_bf16
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_f8e5m2_bf16(%v: vector<2xbf16>, %scale: f32) -> vector<4xf8E5M2> {
+ %ret = amdgpu.packed_scaled_trunc %v into undef[index 0], %scale : vector<2xbf16> to vector<4xf8E5M2>
+ func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e5m2_bf16
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_into_f8e5m2_bf16(%v: vector<2xbf16>, %existing: vector<4xf8E5M2>, %scale: f32) -> vector<4xf8E5M2> {
+ %ret = amdgpu.packed_scaled_trunc %v into %existing[index 0], %scale : vector<2xbf16> to vector<4xf8E5M2> into vector<4xf8E5M2>
+ func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f4e2m1_f32
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_f4e2m1_f32(%v: vector<2xf32>, %scale: f32) -> vector<8xf4E2M1FN> {
+ %ret = amdgpu.packed_scaled_trunc %v into undef[index 0], %scale : vector<2xf32> to vector<8xf4E2M1FN>
+ func.return %ret : vector<8xf4E2M1FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f4e2m1_f32
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_into_f4e2m1_f32(%v: vector<2xf32>, %existing: vector<8xf4E2M1FN>, %scale: f32) -> vector<8xf4E2M1FN> {
+ %ret = amdgpu.packed_scaled_trunc %v into %existing[index 0], %scale : vector<2xf32> to vector<8xf4E2M1FN> into vector<8xf4E2M1FN>
+ func.return %ret : vector<8xf4E2M1FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f4e2m1_f16
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_f4e2m1_f16(%v: vector<2xf16>, %scale: f32) -> vector<8xf4E2M1FN> {
+ %ret = amdgpu.packed_scaled_trunc %v into undef[index 0], %scale : vector<2xf16> to vector<8xf4E2M1FN>
+ func.return %ret : vector<8xf4E2M1FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f4e2m1_f16
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_into_f4e2m1_f16(%v: vector<2xf16>, %existing: vector<8xf4E2M1FN>, %scale: f32) -> vector<8xf4E2M1FN> {
+ %ret = amdgpu.packed_scaled_trunc %v into %existing[index 0], %scale : vector<2xf16> to vector<8xf4E2M1FN> into vector<8xf4E2M1FN>
+ func.return %ret : vector<8xf4E2M1FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f4e2m1_bf16
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_f4e2m1_bf16(%v: vector<2xbf16>, %scale: f32) -> vector<8xf4E2M1FN> {
+ %ret = amdgpu.packed_scaled_trunc %v into undef[index 0], %scale : vector<2xbf16> to vector<8xf4E2M1FN>
+ func.return %ret : vector<8xf4E2M1FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f4e2m1_bf16
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_into_f4e2m1_bf16(%v: vector<2xbf16>, %existing: vector<8xf4E2M1FN>, %scale: f32) -> vector<8xf4E2M1FN> {
+ %ret = amdgpu.packed_scaled_trunc %v into %existing[index 0], %scale : vector<2xbf16> to vector<8xf4E2M1FN> into vector<8xf4E2M1FN>
+ func.return %ret : vector<8xf4E2M1FN>
+}
+
// CHECK-LABEL: func @fat_raw_buffer_cast_easy
// CHECK: amdgpu.fat_raw_buffer_cast
func.func @fat_raw_buffer_cast_easy(%m: memref<8xi32>) -> memref<8xi32, #amdgpu.address_space<fat_raw_buffer>> {
>From 61c95dbb3c9b284219389ae9e5b2f6c405795614 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Thu, 5 Jun 2025 16:05:54 +0200
Subject: [PATCH 3/6] add verifiers
---
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 2 +-
mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 9 +++++++++
2 files changed, 10 insertions(+), 1 deletion(-)
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index dfdb19b2f7e82..b71afd3c96542 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -198,7 +198,7 @@ def AMDGPU_PackedScaledTruncOp
`,` $scale
`:` type($source) `to` type($res) (`into` type($existing)^)?
}];
- let hasVerifier = 0;
+ let hasVerifier = 1;
}
def AMDGPU_PackedStochRoundFp8Op :
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index a0a98a4e86721..0d0add3094666 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -60,6 +60,15 @@ LogicalResult PackedStochRoundFp8Op::verify() {
return success();
}
+//===----------------------------------------------------------------------===//
+// mxfp float ops
+//===----------------------------------------------------------------------===//
+LogicalResult PackedScaledTruncOp::verify() {
+ if (getExisting() && getExisting().getType() != getResult().getType())
+ return emitOpError("existing values must have same type as result");
+ return success();
+}
+
//===----------------------------------------------------------------------===//
// FatRawBufferCastOp
//===----------------------------------------------------------------------===//
>From 7e6b14599aa02b87708f3bb317b2368185421ba4 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Tue, 10 Jun 2025 11:15:08 +0000
Subject: [PATCH 4/6] replace undef with zero
---
.../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 4 +-
.../Conversion/AMDGPUToROCDL/packed-ext.mlir | 68 +++++++++----------
.../AMDGPUToROCDL/packed-trunc.mlir | 36 +++++-----
mlir/test/Dialect/AMDGPU/ops.mlir | 36 +++++-----
4 files changed, 72 insertions(+), 72 deletions(-)
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index aed9b71c69cba..c13fc63684cb4 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -1289,7 +1289,7 @@ LogicalResult ScaledExtPackedOpLowering::matchAndRewrite(
// Extend to a packedVectorType
if (!sourceVecType ||
sourceVecType.getNumElements() < packedVecType.getNumElements()) {
- Value longVec = rewriter.create<LLVM::UndefOp>(loc, packedVecType);
+ Value longVec = rewriter.create<LLVM::ZeroOp>(loc, packedVecType);
if (!sourceVecType) {
longVec = rewriter.create<LLVM::InsertElementOp>(
loc, longVec, source, createI32Constant(rewriter, loc, 0));
@@ -1362,7 +1362,7 @@ LogicalResult PackedScaledTruncOpLowering::matchAndRewrite(
if (existing)
existing = rewriter.create<LLVM::BitcastOp>(loc, intResultType, existing);
else
- existing = rewriter.create<LLVM::UndefOp>(loc, intResultType);
+ existing = rewriter.create<LLVM::ZeroOp>(loc, intResultType);
Value sourceA, sourceB;
if (sourceElemType.isF32()) {
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/packed-ext.mlir b/mlir/test/Conversion/AMDGPUToROCDL/packed-ext.mlir
index 27a66e9f1bcbd..f0df35457babf 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/packed-ext.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/packed-ext.mlir
@@ -29,10 +29,10 @@ func.func @scaled_ext_full_f8e4m3_bf16(%v: vector<4xf8E4M3FN>, %scale: f32) -> v
// CHECK-LABEL: func.func @scaled_ext_half_f8e4m3_f32
// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E4M3FN> to vector<2xi8>
-// CHECK-DAG: [[UNDEF_:%.+]] = llvm.mlir.undef : vector<4xi8>
+// CHECK-DAG: [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
// CHECK: [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
-// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[UNDEF_]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
// CHECK-DAG: [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
// CHECK: [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
// CHECK: [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
@@ -45,10 +45,10 @@ func.func @scaled_ext_half_f8e4m3_f32(%v: vector<2xf8E4M3FN>, %scale: f32) -> ve
// CHECK-LABEL: func.func @scaled_ext_half_f8e4m3_f16
// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E4M3FN> to vector<2xi8>
-// CHECK-DAG: [[UNDEF_:%.+]] = llvm.mlir.undef : vector<4xi8>
+// CHECK-DAG: [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
// CHECK: [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
-// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[UNDEF_]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
// CHECK-DAG: [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
// CHECK: [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
// CHECK: [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
@@ -61,10 +61,10 @@ func.func @scaled_ext_half_f8e4m3_f16(%v: vector<2xf8E4M3FN>, %scale: f32) -> ve
// CHECK-LABEL: func.func @scaled_ext_half_f8e4m3_bf16
// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E4M3FN> to vector<2xi8>
-// CHECK-DAG: [[UNDEF_:%.+]] = llvm.mlir.undef : vector<4xi8>
+// CHECK-DAG: [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
// CHECK: [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
-// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[UNDEF_]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
// CHECK-DAG: [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
// CHECK: [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
// CHECK: [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
@@ -77,10 +77,10 @@ func.func @scaled_ext_half_f8e4m3_bf16(%v: vector<2xf8E4M3FN>, %scale: f32) -> v
// CHECK-LABEL: func.func @scaled_ext_scalar_f8e4m3_f32
// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E4M3FN> to vector<2xi8>
-// CHECK-DAG: [[UNDEF_:%.+]] = llvm.mlir.undef : vector<4xi8>
+// CHECK-DAG: [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
// CHECK: [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
-// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[UNDEF_]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
// CHECK-DAG: [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
// CHECK: [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
// CHECK: [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
@@ -93,10 +93,10 @@ func.func @scaled_ext_scalar_f8e4m3_f32(%v: vector<2xf8E4M3FN>, %scale: f32) ->
// CHECK-LABEL: func.func @scaled_ext_scalar_f8e4m3_f16
// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E4M3FN> to vector<2xi8>
-// CHECK-DAG: [[UNDEF_:%.+]] = llvm.mlir.undef : vector<4xi8>
+// CHECK-DAG: [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
// CHECK: [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
-// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[UNDEF_]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
// CHECK-DAG: [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
// CHECK: [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
// CHECK: [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
@@ -109,10 +109,10 @@ func.func @scaled_ext_scalar_f8e4m3_f16(%v: vector<2xf8E4M3FN>, %scale: f32) ->
// CHECK-LABEL: func.func @scaled_ext_scalar_f8e4m3_bf16
// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E4M3FN> to vector<2xi8>
-// CHECK-DAG: [[UNDEF_:%.+]] = llvm.mlir.undef : vector<4xi8>
+// CHECK-DAG: [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
// CHECK: [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
-// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[UNDEF_]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
// CHECK-DAG: [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
// CHECK: [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
// CHECK: [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
@@ -152,10 +152,10 @@ func.func @scaled_ext_full_f8e5m2_bf16(%v: vector<4xf8E5M2>, %scale: f32) -> vec
// CHECK-LABEL: func.func @scaled_ext_half_f8e5m2_f32
// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E5M2> to vector<2xi8>
-// CHECK-DAG: [[UNDEF_:%.+]] = llvm.mlir.undef : vector<4xi8>
+// CHECK-DAG: [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
// CHECK: [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
-// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[UNDEF_]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
// CHECK-DAG: [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
// CHECK: [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
// CHECK: [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
@@ -168,10 +168,10 @@ func.func @scaled_ext_half_f8e5m2_f32(%v: vector<2xf8E5M2>, %scale: f32) -> vect
// CHECK-LABEL: func.func @scaled_ext_half_f8e5m2_f16
// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E5M2> to vector<2xi8>
-// CHECK-DAG: [[UNDEF_:%.+]] = llvm.mlir.undef : vector<4xi8>
+// CHECK-DAG: [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
// CHECK: [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
-// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[UNDEF_]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
// CHECK-DAG: [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
// CHECK: [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
// CHECK: [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
@@ -184,10 +184,10 @@ func.func @scaled_ext_half_f8e5m2_f16(%v: vector<2xf8E5M2>, %scale: f32) -> vect
// CHECK-LABEL: func.func @scaled_ext_half_f8e5m2_bf16
// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E5M2> to vector<2xi8>
-// CHECK-DAG: [[UNDEF_:%.+]] = llvm.mlir.undef : vector<4xi8>
+// CHECK-DAG: [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
// CHECK: [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
-// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[UNDEF_]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
// CHECK-DAG: [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
// CHECK: [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
// CHECK: [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
@@ -200,10 +200,10 @@ func.func @scaled_ext_half_f8e5m2_bf16(%v: vector<2xf8E5M2>, %scale: f32) -> vec
// CHECK-LABEL: func.func @scaled_ext_scalar_f8e5m2_f32
// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E5M2> to vector<2xi8>
-// CHECK-DAG: [[UNDEF_:%.+]] = llvm.mlir.undef : vector<4xi8>
+// CHECK-DAG: [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
// CHECK: [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
-// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[UNDEF_]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
// CHECK-DAG: [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
// CHECK: [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
// CHECK: [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
@@ -216,10 +216,10 @@ func.func @scaled_ext_scalar_f8e5m2_f32(%v: vector<2xf8E5M2>, %scale: f32) -> ve
// CHECK-LABEL: func.func @scaled_ext_scalar_f8e5m2_f16
// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E5M2> to vector<2xi8>
-// CHECK-DAG: [[UNDEF_:%.+]] = llvm.mlir.undef : vector<4xi8>
+// CHECK-DAG: [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
// CHECK: [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
-// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[UNDEF_]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
// CHECK-DAG: [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
// CHECK: [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
// CHECK: [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
@@ -232,10 +232,10 @@ func.func @scaled_ext_scalar_f8e5m2_f16(%v: vector<2xf8E5M2>, %scale: f32) -> ve
// CHECK-LABEL: func.func @scaled_ext_scalar_f8e5m2_bf16
// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E5M2> to vector<2xi8>
-// CHECK-DAG: [[UNDEF_:%.+]] = llvm.mlir.undef : vector<4xi8>
+// CHECK-DAG: [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
// CHECK: [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
-// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[UNDEF_]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
// CHECK-DAG: [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
// CHECK: [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
// CHECK: [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
@@ -284,10 +284,10 @@ func.func @scaled_ext_half_f4e2m1_f32(%v: vector<8xf4E2M1FN>, %scale: f32) -> ve
// CHECK-LABEL: func.func @scaled_ext_half_f4e2m1_f16
// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<4xf4E2M1FN> to vector<4xi4>
-// CHECK-DAG: [[UNDEF_:%.+]] = llvm.mlir.undef : vector<8xi4>
+// CHECK-DAG: [[ZERO:%.+]] = llvm.mlir.zero : vector<8xi4>
// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
// CHECK: [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<4xi4>
-// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[UNDEF_]]{{\[}}[[C0]] : i32] : vector<8xi4>
+// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<8xi4>
// CHECK-DAG: [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
// CHECK: [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<4xi4>
// CHECK: [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<8xi4>
@@ -306,10 +306,10 @@ func.func @scaled_ext_half_f4e2m1_f16(%v: vector<4xf4E2M1FN>, %scale: f32) -> ve
// CHECK-LABEL: func.func @scaled_ext_half_f4e2m1_bf16
// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<4xf4E2M1FN> to vector<4xi4>
-// CHECK-DAG: [[UNDEF_:%.+]] = llvm.mlir.undef : vector<8xi4>
+// CHECK-DAG: [[ZERO:%.+]] = llvm.mlir.zero : vector<8xi4>
// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
// CHECK: [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<4xi4>
-// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[UNDEF_]]{{\[}}[[C0]] : i32] : vector<8xi4>
+// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<8xi4>
// CHECK-DAG: [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
// CHECK: [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<4xi4>
// CHECK: [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<8xi4>
@@ -328,10 +328,10 @@ func.func @scaled_ext_half_f4e2m1_bf16(%v: vector<4xf4E2M1FN>, %scale: f32) -> v
// CHECK-LABEL: func.func @scaled_ext_scalar_f4e2m1_f32
// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf4E2M1FN> to vector<2xi4>
-// CHECK-DAG: [[UNDEF_:%.+]] = llvm.mlir.undef : vector<8xi4>
+// CHECK-DAG: [[ZERO:%.+]] = llvm.mlir.zero : vector<8xi4>
// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
// CHECK: [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi4>
-// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[UNDEF_]]{{\[}}[[C0]] : i32] : vector<8xi4>
+// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<8xi4>
// CHECK-DAG: [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
// CHECK: [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi4>
// CHECK: [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<8xi4>
@@ -344,10 +344,10 @@ func.func @scaled_ext_scalar_f4e2m1_f32(%v: vector<2xf4E2M1FN>, %scale: f32) ->
// CHECK-LABEL: func.func @scaled_ext_scalar_f4e2m1_f16
// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf4E2M1FN> to vector<2xi4>
-// CHECK-DAG: [[UNDEF_:%.+]] = llvm.mlir.undef : vector<8xi4>
+// CHECK-DAG: [[ZERO:%.+]] = llvm.mlir.zero : vector<8xi4>
// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
// CHECK: [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi4>
-// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[UNDEF_]]{{\[}}[[C0]] : i32] : vector<8xi4>
+// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<8xi4>
// CHECK-DAG: [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
// CHECK: [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi4>
// CHECK: [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<8xi4>
@@ -360,10 +360,10 @@ func.func @scaled_ext_scalar_f4e2m1_f16(%v: vector<2xf4E2M1FN>, %scale: f32) ->
// CHECK-LABEL: func.func @scaled_ext_scalar_f4e2m1_bf16
// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf4E2M1FN> to vector<2xi4>
-// CHECK-DAG: [[UNDEF_:%.+]] = llvm.mlir.undef : vector<8xi4>
+// CHECK-DAG: [[ZERO:%.+]] = llvm.mlir.zero : vector<8xi4>
// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
// CHECK: [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi4>
-// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[UNDEF_]]{{\[}}[[C0]] : i32] : vector<8xi4>
+// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<8xi4>
// CHECK-DAG: [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
// CHECK: [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi4>
// CHECK: [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<8xi4>
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/packed-trunc.mlir b/mlir/test/Conversion/AMDGPUToROCDL/packed-trunc.mlir
index b90f485457b2d..8bf32987c6154 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/packed-trunc.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/packed-trunc.mlir
@@ -1,12 +1,12 @@
// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx950 | FileCheck %s
// CHECK-LABEL: func.func @packed_scaled_trunc_f8e4m3_f32
-// CHECK-DAG: [[UNDEF:%.+]] = llvm.mlir.undef : vector<2xi16>
+// CHECK-DAG: [[ZERO:%.+]] = llvm.mlir.zero : vector<2xi16>
// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
// CHECK-DAG: [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
// CHECK: [[ELEM0:%.+]] = llvm.extractelement %arg0{{\[}}[[C0]] : i32] : vector<2xf32>
// CHECK: [[ELEM1:%.+]] = llvm.extractelement %arg0{{\[}}[[C1]] : i32] : vector<2xf32>
-// CHECK: [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp8.f32 [[ELEM0]], [[ELEM1]], %arg1 -> [[UNDEF]][false] : vector<2xi16>
+// CHECK: [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp8.f32 [[ELEM0]], [[ELEM1]], %arg1 -> [[ZERO]][false] : vector<2xi16>
// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
// CHECK: [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E4M3FN>
// CHECK: return [[CAST]] : vector<4xf8E4M3FN>
@@ -32,8 +32,8 @@ func.func @packed_scaled_trunc_into_f8e4m3_f32(%v: vector<2xf32>, %existing: vec
}
// CHECK-LABEL: func.func @packed_scaled_trunc_f8e4m3_f16
-// CHECK-DAG: [[UNDEF:%.+]] = llvm.mlir.undef : vector<2xi16>
-// CHECK: [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp8.f16 %arg0, %arg1 -> [[UNDEF]][false] : vector<2xi16>
+// CHECK-DAG: [[ZERO:%.+]] = llvm.mlir.zero : vector<2xi16>
+// CHECK: [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp8.f16 %arg0, %arg1 -> [[ZERO]][false] : vector<2xi16>
// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
// CHECK: [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E4M3FN>
// CHECK: return [[CAST]] : vector<4xf8E4M3FN>
@@ -55,8 +55,8 @@ func.func @packed_scaled_trunc_into_f8e4m3_f16(%v: vector<2xf16>, %existing: vec
}
// CHECK-LABEL: func.func @packed_scaled_trunc_f8e4m3_bf16
-// CHECK-DAG: [[UNDEF:%.+]] = llvm.mlir.undef : vector<2xi16>
-// CHECK: [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp8.bf16 %arg0, %arg1 -> [[UNDEF]][false] : vector<2xi16>
+// CHECK-DAG: [[ZERO:%.+]] = llvm.mlir.zero : vector<2xi16>
+// CHECK: [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp8.bf16 %arg0, %arg1 -> [[ZERO]][false] : vector<2xi16>
// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
// CHECK: [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E4M3FN>
// CHECK: return [[CAST]] : vector<4xf8E4M3FN>
@@ -78,12 +78,12 @@ func.func @packed_scaled_trunc_into_f8e4m3_bf16(%v: vector<2xbf16>, %existing: v
}
// CHECK-LABEL: func.func @packed_scaled_trunc_f8e5m2_f32
-// CHECK-DAG: [[UNDEF:%.+]] = llvm.mlir.undef : vector<2xi16>
+// CHECK-DAG: [[ZERO:%.+]] = llvm.mlir.zero : vector<2xi16>
// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
// CHECK-DAG: [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
// CHECK: [[ELEM0:%.+]] = llvm.extractelement %arg0{{\[}}[[C0]] : i32] : vector<2xf32>
// CHECK: [[ELEM1:%.+]] = llvm.extractelement %arg0{{\[}}[[C1]] : i32] : vector<2xf32>
-// CHECK: [[CVT:%.+]] = rocdl.cvt.scalef32.pk.bf8.f32 [[ELEM0]], [[ELEM1]], %arg1 -> [[UNDEF]][false] : vector<2xi16>
+// CHECK: [[CVT:%.+]] = rocdl.cvt.scalef32.pk.bf8.f32 [[ELEM0]], [[ELEM1]], %arg1 -> [[ZERO]][false] : vector<2xi16>
// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
// CHECK: [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E5M2>
// CHECK: return [[CAST]] : vector<4xf8E5M2>
@@ -109,8 +109,8 @@ func.func @packed_scaled_trunc_into_f8e5m2_f32(%v: vector<2xf32>, %existing: vec
}
// CHECK-LABEL: func.func @packed_scaled_trunc_f8e5m2_f16
-// CHECK-DAG: [[UNDEF:%.+]] = llvm.mlir.undef : vector<2xi16>
-// CHECK: [[CVT:%.+]] = rocdl.cvt.scalef32.pk.bf8.f16 %arg0, %arg1 -> [[UNDEF]][false] : vector<2xi16>
+// CHECK-DAG: [[ZERO:%.+]] = llvm.mlir.zero : vector<2xi16>
+// CHECK: [[CVT:%.+]] = rocdl.cvt.scalef32.pk.bf8.f16 %arg0, %arg1 -> [[ZERO]][false] : vector<2xi16>
// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
// CHECK: [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E5M2>
// CHECK: return [[CAST]] : vector<4xf8E5M2>
@@ -132,8 +132,8 @@ func.func @packed_scaled_trunc_into_f8e5m2_f16(%v: vector<2xf16>, %existing: vec
}
// CHECK-LABEL: func.func @packed_scaled_trunc_f8e5m2_bf16
-// CHECK-DAG: [[UNDEF:%.+]] = llvm.mlir.undef : vector<2xi16>
-// CHECK: [[CVT:%.+]] = rocdl.cvt.scalef32.pk.bf8.bf16 %arg0, %arg1 -> [[UNDEF]][false] : vector<2xi16>
+// CHECK-DAG: [[ZERO:%.+]] = llvm.mlir.zero : vector<2xi16>
+// CHECK: [[CVT:%.+]] = rocdl.cvt.scalef32.pk.bf8.bf16 %arg0, %arg1 -> [[ZERO]][false] : vector<2xi16>
// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
// CHECK: [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E5M2>
// CHECK: return [[CAST]] : vector<4xf8E5M2>
@@ -155,12 +155,12 @@ func.func @packed_scaled_trunc_into_f8e5m2_bf16(%v: vector<2xbf16>, %existing: v
}
// CHECK-LABEL: func.func @packed_scaled_trunc_f4e2m1_f32
-// CHECK-DAG: [[UNDEF:%.+]] = llvm.mlir.undef : i32
+// CHECK-DAG: [[ZERO:%.+]] = llvm.mlir.zero : i32
// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
// CHECK-DAG: [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
// CHECK: [[ELEM0:%.+]] = llvm.extractelement %arg0{{\[}}[[C0]] : i32] : vector<2xf32>
// CHECK: [[ELEM1:%.+]] = llvm.extractelement %arg0{{\[}}[[C1]] : i32] : vector<2xf32>
-// CHECK: [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp4.f32 [[ELEM0]], [[ELEM1]], %arg1 -> [[UNDEF]][0] : i32
+// CHECK: [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp4.f32 [[ELEM0]], [[ELEM1]], %arg1 -> [[ZERO]][0] : i32
// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : i32 to vector<8xi4>
// CHECK: [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<8xi4> to vector<8xf4E2M1FN>
// CHECK: return [[CAST]] : vector<8xf4E2M1FN>
@@ -186,8 +186,8 @@ func.func @packed_scaled_trunc_into_f4e2m1_f32(%v: vector<2xf32>, %existing: vec
}
// CHECK-LABEL: func.func @packed_scaled_trunc_f4e2m1_f16
-// CHECK-DAG: [[UNDEF:%.+]] = llvm.mlir.undef : i32
-// CHECK: [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp4.f16 %arg0, %arg1 -> [[UNDEF]][0] : i32
+// CHECK-DAG: [[ZERO:%.+]] = llvm.mlir.zero : i32
+// CHECK: [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp4.f16 %arg0, %arg1 -> [[ZERO]][0] : i32
// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : i32 to vector<8xi4>
// CHECK: [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<8xi4> to vector<8xf4E2M1FN>
// CHECK: return [[CAST]] : vector<8xf4E2M1FN>
@@ -209,8 +209,8 @@ func.func @packed_scaled_trunc_into_f4e2m1_f16(%v: vector<2xf16>, %existing: vec
}
// CHECK-LABEL: func.func @packed_scaled_trunc_f4e2m1_bf16
-// CHECK-DAG: [[UNDEF:%.+]] = llvm.mlir.undef : i32
-// CHECK: [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp4.bf16 %arg0, %arg1 -> [[UNDEF]][0] : i32
+// CHECK-DAG: [[ZERO:%.+]] = llvm.mlir.zero : i32
+// CHECK: [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp4.bf16 %arg0, %arg1 -> [[ZERO]][0] : i32
// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : i32 to vector<8xi4>
// CHECK: [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<8xi4> to vector<8xf4E2M1FN>
// CHECK: return [[CAST]] : vector<8xf4E2M1FN>
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 354ad19e41931..6c3ffb575f7c2 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -224,126 +224,126 @@ func.func @scaled_ext_scalar_f4e2m1_bf16(%v: vector<2xf4E2M1FN>, %scale: f32) ->
// CHECK-LABEL: func.func @packed_scaled_trunc_f8e4m3_f32
// CHECK: amdgpu.packed_scaled_trunc
func.func @packed_scaled_trunc_f8e4m3_f32(%v: vector<2xf32>, %scale: f32) -> vector<4xf8E4M3FN> {
- %ret = amdgpu.packed_scaled_trunc %v into undef[index 0], %scale : vector<2xf32> to vector<4xf8E4M3FN>
+ %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xf32> to vector<4xf8E4M3FN>
func.return %ret : vector<4xf8E4M3FN>
}
// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e4m3_f32
// CHECK: amdgpu.packed_scaled_trunc
func.func @packed_scaled_trunc_into_f8e4m3_f32(%v: vector<2xf32>, %existing: vector<4xf8E4M3FN>, %scale: f32) -> vector<4xf8E4M3FN> {
- %ret = amdgpu.packed_scaled_trunc %v into %existing[index 0], %scale : vector<2xf32> to vector<4xf8E4M3FN> into vector<4xf8E4M3FN>
+ %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xf32> to vector<4xf8E4M3FN> into vector<4xf8E4M3FN>
func.return %ret : vector<4xf8E4M3FN>
}
// CHECK-LABEL: func.func @packed_scaled_trunc_f8e4m3_f16
// CHECK: amdgpu.packed_scaled_trunc
func.func @packed_scaled_trunc_f8e4m3_f16(%v: vector<2xf16>, %scale: f32) -> vector<4xf8E4M3FN> {
- %ret = amdgpu.packed_scaled_trunc %v into undef[index 0], %scale : vector<2xf16> to vector<4xf8E4M3FN>
+ %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xf16> to vector<4xf8E4M3FN>
func.return %ret : vector<4xf8E4M3FN>
}
// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e4m3_f16
// CHECK: amdgpu.packed_scaled_trunc
func.func @packed_scaled_trunc_into_f8e4m3_f16(%v: vector<2xf16>, %existing: vector<4xf8E4M3FN>, %scale: f32) -> vector<4xf8E4M3FN> {
- %ret = amdgpu.packed_scaled_trunc %v into %existing[index 0], %scale : vector<2xf16> to vector<4xf8E4M3FN> into vector<4xf8E4M3FN>
+ %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xf16> to vector<4xf8E4M3FN> into vector<4xf8E4M3FN>
func.return %ret : vector<4xf8E4M3FN>
}
// CHECK-LABEL: func.func @packed_scaled_trunc_f8e4m3_bf16
// CHECK: amdgpu.packed_scaled_trunc
func.func @packed_scaled_trunc_f8e4m3_bf16(%v: vector<2xbf16>, %scale: f32) -> vector<4xf8E4M3FN> {
- %ret = amdgpu.packed_scaled_trunc %v into undef[index 0], %scale : vector<2xbf16> to vector<4xf8E4M3FN>
+ %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xbf16> to vector<4xf8E4M3FN>
func.return %ret : vector<4xf8E4M3FN>
}
// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e4m3_bf16
// CHECK: amdgpu.packed_scaled_trunc
func.func @packed_scaled_trunc_into_f8e4m3_bf16(%v: vector<2xbf16>, %existing: vector<4xf8E4M3FN>, %scale: f32) -> vector<4xf8E4M3FN> {
- %ret = amdgpu.packed_scaled_trunc %v into %existing[index 0], %scale : vector<2xbf16> to vector<4xf8E4M3FN> into vector<4xf8E4M3FN>
+ %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xbf16> to vector<4xf8E4M3FN> into vector<4xf8E4M3FN>
func.return %ret : vector<4xf8E4M3FN>
}
// CHECK-LABEL: func.func @packed_scaled_trunc_f8e5m2_f32
// CHECK: amdgpu.packed_scaled_trunc
func.func @packed_scaled_trunc_f8e5m2_f32(%v: vector<2xf32>, %scale: f32) -> vector<4xf8E5M2> {
- %ret = amdgpu.packed_scaled_trunc %v into undef[index 0], %scale : vector<2xf32> to vector<4xf8E5M2>
+ %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xf32> to vector<4xf8E5M2>
func.return %ret : vector<4xf8E5M2>
}
// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e5m2_f32
// CHECK: amdgpu.packed_scaled_trunc
func.func @packed_scaled_trunc_into_f8e5m2_f32(%v: vector<2xf32>, %existing: vector<4xf8E5M2>, %scale: f32) -> vector<4xf8E5M2> {
- %ret = amdgpu.packed_scaled_trunc %v into %existing[index 0], %scale : vector<2xf32> to vector<4xf8E5M2> into vector<4xf8E5M2>
+ %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xf32> to vector<4xf8E5M2> into vector<4xf8E5M2>
func.return %ret : vector<4xf8E5M2>
}
// CHECK-LABEL: func.func @packed_scaled_trunc_f8e5m2_f16
// CHECK: amdgpu.packed_scaled_trunc
func.func @packed_scaled_trunc_f8e5m2_f16(%v: vector<2xf16>, %scale: f32) -> vector<4xf8E5M2> {
- %ret = amdgpu.packed_scaled_trunc %v into undef[index 0], %scale : vector<2xf16> to vector<4xf8E5M2>
+ %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xf16> to vector<4xf8E5M2>
func.return %ret : vector<4xf8E5M2>
}
// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e5m2_f16
// CHECK: amdgpu.packed_scaled_trunc
func.func @packed_scaled_trunc_into_f8e5m2_f16(%v: vector<2xf16>, %existing: vector<4xf8E5M2>, %scale: f32) -> vector<4xf8E5M2> {
- %ret = amdgpu.packed_scaled_trunc %v into %existing[index 0], %scale : vector<2xf16> to vector<4xf8E5M2> into vector<4xf8E5M2>
+ %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xf16> to vector<4xf8E5M2> into vector<4xf8E5M2>
func.return %ret : vector<4xf8E5M2>
}
// CHECK-LABEL: func.func @packed_scaled_trunc_f8e5m2_bf16
// CHECK: amdgpu.packed_scaled_trunc
func.func @packed_scaled_trunc_f8e5m2_bf16(%v: vector<2xbf16>, %scale: f32) -> vector<4xf8E5M2> {
- %ret = amdgpu.packed_scaled_trunc %v into undef[index 0], %scale : vector<2xbf16> to vector<4xf8E5M2>
+ %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xbf16> to vector<4xf8E5M2>
func.return %ret : vector<4xf8E5M2>
}
// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e5m2_bf16
// CHECK: amdgpu.packed_scaled_trunc
func.func @packed_scaled_trunc_into_f8e5m2_bf16(%v: vector<2xbf16>, %existing: vector<4xf8E5M2>, %scale: f32) -> vector<4xf8E5M2> {
- %ret = amdgpu.packed_scaled_trunc %v into %existing[index 0], %scale : vector<2xbf16> to vector<4xf8E5M2> into vector<4xf8E5M2>
+ %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xbf16> to vector<4xf8E5M2> into vector<4xf8E5M2>
func.return %ret : vector<4xf8E5M2>
}
// CHECK-LABEL: func.func @packed_scaled_trunc_f4e2m1_f32
// CHECK: amdgpu.packed_scaled_trunc
func.func @packed_scaled_trunc_f4e2m1_f32(%v: vector<2xf32>, %scale: f32) -> vector<8xf4E2M1FN> {
- %ret = amdgpu.packed_scaled_trunc %v into undef[index 0], %scale : vector<2xf32> to vector<8xf4E2M1FN>
+ %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xf32> to vector<8xf4E2M1FN>
func.return %ret : vector<8xf4E2M1FN>
}
// CHECK-LABEL: func.func @packed_scaled_trunc_into_f4e2m1_f32
// CHECK: amdgpu.packed_scaled_trunc
func.func @packed_scaled_trunc_into_f4e2m1_f32(%v: vector<2xf32>, %existing: vector<8xf4E2M1FN>, %scale: f32) -> vector<8xf4E2M1FN> {
- %ret = amdgpu.packed_scaled_trunc %v into %existing[index 0], %scale : vector<2xf32> to vector<8xf4E2M1FN> into vector<8xf4E2M1FN>
+ %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xf32> to vector<8xf4E2M1FN> into vector<8xf4E2M1FN>
func.return %ret : vector<8xf4E2M1FN>
}
// CHECK-LABEL: func.func @packed_scaled_trunc_f4e2m1_f16
// CHECK: amdgpu.packed_scaled_trunc
func.func @packed_scaled_trunc_f4e2m1_f16(%v: vector<2xf16>, %scale: f32) -> vector<8xf4E2M1FN> {
- %ret = amdgpu.packed_scaled_trunc %v into undef[index 0], %scale : vector<2xf16> to vector<8xf4E2M1FN>
+ %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xf16> to vector<8xf4E2M1FN>
func.return %ret : vector<8xf4E2M1FN>
}
// CHECK-LABEL: func.func @packed_scaled_trunc_into_f4e2m1_f16
// CHECK: amdgpu.packed_scaled_trunc
func.func @packed_scaled_trunc_into_f4e2m1_f16(%v: vector<2xf16>, %existing: vector<8xf4E2M1FN>, %scale: f32) -> vector<8xf4E2M1FN> {
- %ret = amdgpu.packed_scaled_trunc %v into %existing[index 0], %scale : vector<2xf16> to vector<8xf4E2M1FN> into vector<8xf4E2M1FN>
+ %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xf16> to vector<8xf4E2M1FN> into vector<8xf4E2M1FN>
func.return %ret : vector<8xf4E2M1FN>
}
// CHECK-LABEL: func.func @packed_scaled_trunc_f4e2m1_bf16
// CHECK: amdgpu.packed_scaled_trunc
func.func @packed_scaled_trunc_f4e2m1_bf16(%v: vector<2xbf16>, %scale: f32) -> vector<8xf4E2M1FN> {
- %ret = amdgpu.packed_scaled_trunc %v into undef[index 0], %scale : vector<2xbf16> to vector<8xf4E2M1FN>
+ %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xbf16> to vector<8xf4E2M1FN>
func.return %ret : vector<8xf4E2M1FN>
}
// CHECK-LABEL: func.func @packed_scaled_trunc_into_f4e2m1_bf16
// CHECK: amdgpu.packed_scaled_trunc
func.func @packed_scaled_trunc_into_f4e2m1_bf16(%v: vector<2xbf16>, %existing: vector<8xf4E2M1FN>, %scale: f32) -> vector<8xf4E2M1FN> {
- %ret = amdgpu.packed_scaled_trunc %v into %existing[index 0], %scale : vector<2xbf16> to vector<8xf4E2M1FN> into vector<8xf4E2M1FN>
+ %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xbf16> to vector<8xf4E2M1FN> into vector<8xf4E2M1FN>
func.return %ret : vector<8xf4E2M1FN>
}
>From 986f72b4db6f9d9145ddce41d4182e4e806385b2 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Tue, 10 Jun 2025 11:15:54 +0000
Subject: [PATCH 5/6] remove index from asm
---
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 2 +-
.../AMDGPUToROCDL/packed-trunc.mlir | 36 +++++++++----------
2 files changed, 19 insertions(+), 19 deletions(-)
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index b71afd3c96542..cb64e7c163d2c 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -194,7 +194,7 @@ def AMDGPU_PackedScaledTruncOp
packed vectors of float values.
}];
let assemblyFormat = [{
- attr-dict $source `into` ($existing^):(`undef`)? `[` `index` $index `]`
+ attr-dict $source `into` ($existing^):(`undef`)? `[` $index `]`
`,` $scale
`:` type($source) `to` type($res) (`into` type($existing)^)?
}];
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/packed-trunc.mlir b/mlir/test/Conversion/AMDGPUToROCDL/packed-trunc.mlir
index 8bf32987c6154..30924474f44c1 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/packed-trunc.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/packed-trunc.mlir
@@ -11,7 +11,7 @@
// CHECK: [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E4M3FN>
// CHECK: return [[CAST]] : vector<4xf8E4M3FN>
func.func @packed_scaled_trunc_f8e4m3_f32(%v: vector<2xf32>, %scale: f32) -> vector<4xf8E4M3FN> {
- %ret = amdgpu.packed_scaled_trunc %v into undef[index 0], %scale : vector<2xf32> to vector<4xf8E4M3FN>
+ %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xf32> to vector<4xf8E4M3FN>
func.return %ret : vector<4xf8E4M3FN>
}
@@ -27,7 +27,7 @@ func.func @packed_scaled_trunc_f8e4m3_f32(%v: vector<2xf32>, %scale: f32) -> vec
// CHECK: [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E4M3FN>
// CHECK: return [[CAST]] : vector<4xf8E4M3FN>
func.func @packed_scaled_trunc_into_f8e4m3_f32(%v: vector<2xf32>, %existing: vector<4xf8E4M3FN>, %scale: f32) -> vector<4xf8E4M3FN> {
- %ret = amdgpu.packed_scaled_trunc %v into %existing[index 0], %scale : vector<2xf32> to vector<4xf8E4M3FN> into vector<4xf8E4M3FN>
+ %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xf32> to vector<4xf8E4M3FN> into vector<4xf8E4M3FN>
func.return %ret : vector<4xf8E4M3FN>
}
@@ -38,7 +38,7 @@ func.func @packed_scaled_trunc_into_f8e4m3_f32(%v: vector<2xf32>, %existing: vec
// CHECK: [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E4M3FN>
// CHECK: return [[CAST]] : vector<4xf8E4M3FN>
func.func @packed_scaled_trunc_f8e4m3_f16(%v: vector<2xf16>, %scale: f32) -> vector<4xf8E4M3FN> {
- %ret = amdgpu.packed_scaled_trunc %v into undef[index 0], %scale : vector<2xf16> to vector<4xf8E4M3FN>
+ %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xf16> to vector<4xf8E4M3FN>
func.return %ret : vector<4xf8E4M3FN>
}
@@ -50,7 +50,7 @@ func.func @packed_scaled_trunc_f8e4m3_f16(%v: vector<2xf16>, %scale: f32) -> vec
// CHECK: [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E4M3FN>
// CHECK: return [[CAST]] : vector<4xf8E4M3FN>
func.func @packed_scaled_trunc_into_f8e4m3_f16(%v: vector<2xf16>, %existing: vector<4xf8E4M3FN>, %scale: f32) -> vector<4xf8E4M3FN> {
- %ret = amdgpu.packed_scaled_trunc %v into %existing[index 0], %scale : vector<2xf16> to vector<4xf8E4M3FN> into vector<4xf8E4M3FN>
+ %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xf16> to vector<4xf8E4M3FN> into vector<4xf8E4M3FN>
func.return %ret : vector<4xf8E4M3FN>
}
@@ -61,7 +61,7 @@ func.func @packed_scaled_trunc_into_f8e4m3_f16(%v: vector<2xf16>, %existing: vec
// CHECK: [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E4M3FN>
// CHECK: return [[CAST]] : vector<4xf8E4M3FN>
func.func @packed_scaled_trunc_f8e4m3_bf16(%v: vector<2xbf16>, %scale: f32) -> vector<4xf8E4M3FN> {
- %ret = amdgpu.packed_scaled_trunc %v into undef[index 0], %scale : vector<2xbf16> to vector<4xf8E4M3FN>
+ %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xbf16> to vector<4xf8E4M3FN>
func.return %ret : vector<4xf8E4M3FN>
}
@@ -73,7 +73,7 @@ func.func @packed_scaled_trunc_f8e4m3_bf16(%v: vector<2xbf16>, %scale: f32) -> v
// CHECK: [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E4M3FN>
// CHECK: return [[CAST]] : vector<4xf8E4M3FN>
func.func @packed_scaled_trunc_into_f8e4m3_bf16(%v: vector<2xbf16>, %existing: vector<4xf8E4M3FN>, %scale: f32) -> vector<4xf8E4M3FN> {
- %ret = amdgpu.packed_scaled_trunc %v into %existing[index 0], %scale : vector<2xbf16> to vector<4xf8E4M3FN> into vector<4xf8E4M3FN>
+ %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xbf16> to vector<4xf8E4M3FN> into vector<4xf8E4M3FN>
func.return %ret : vector<4xf8E4M3FN>
}
@@ -88,7 +88,7 @@ func.func @packed_scaled_trunc_into_f8e4m3_bf16(%v: vector<2xbf16>, %existing: v
// CHECK: [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E5M2>
// CHECK: return [[CAST]] : vector<4xf8E5M2>
func.func @packed_scaled_trunc_f8e5m2_f32(%v: vector<2xf32>, %scale: f32) -> vector<4xf8E5M2> {
- %ret = amdgpu.packed_scaled_trunc %v into undef[index 0], %scale : vector<2xf32> to vector<4xf8E5M2>
+ %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xf32> to vector<4xf8E5M2>
func.return %ret : vector<4xf8E5M2>
}
@@ -104,7 +104,7 @@ func.func @packed_scaled_trunc_f8e5m2_f32(%v: vector<2xf32>, %scale: f32) -> vec
// CHECK: [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E5M2>
// CHECK: return [[CAST]] : vector<4xf8E5M2>
func.func @packed_scaled_trunc_into_f8e5m2_f32(%v: vector<2xf32>, %existing: vector<4xf8E5M2>, %scale: f32) -> vector<4xf8E5M2> {
- %ret = amdgpu.packed_scaled_trunc %v into %existing[index 0], %scale : vector<2xf32> to vector<4xf8E5M2> into vector<4xf8E5M2>
+ %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xf32> to vector<4xf8E5M2> into vector<4xf8E5M2>
func.return %ret : vector<4xf8E5M2>
}
@@ -115,7 +115,7 @@ func.func @packed_scaled_trunc_into_f8e5m2_f32(%v: vector<2xf32>, %existing: vec
// CHECK: [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E5M2>
// CHECK: return [[CAST]] : vector<4xf8E5M2>
func.func @packed_scaled_trunc_f8e5m2_f16(%v: vector<2xf16>, %scale: f32) -> vector<4xf8E5M2> {
- %ret = amdgpu.packed_scaled_trunc %v into undef[index 0], %scale : vector<2xf16> to vector<4xf8E5M2>
+ %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xf16> to vector<4xf8E5M2>
func.return %ret : vector<4xf8E5M2>
}
@@ -127,7 +127,7 @@ func.func @packed_scaled_trunc_f8e5m2_f16(%v: vector<2xf16>, %scale: f32) -> vec
// CHECK: [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E5M2>
// CHECK: return [[CAST]] : vector<4xf8E5M2>
func.func @packed_scaled_trunc_into_f8e5m2_f16(%v: vector<2xf16>, %existing: vector<4xf8E5M2>, %scale: f32) -> vector<4xf8E5M2> {
- %ret = amdgpu.packed_scaled_trunc %v into %existing[index 0], %scale : vector<2xf16> to vector<4xf8E5M2> into vector<4xf8E5M2>
+ %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xf16> to vector<4xf8E5M2> into vector<4xf8E5M2>
func.return %ret : vector<4xf8E5M2>
}
@@ -138,7 +138,7 @@ func.func @packed_scaled_trunc_into_f8e5m2_f16(%v: vector<2xf16>, %existing: vec
// CHECK: [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E5M2>
// CHECK: return [[CAST]] : vector<4xf8E5M2>
func.func @packed_scaled_trunc_f8e5m2_bf16(%v: vector<2xbf16>, %scale: f32) -> vector<4xf8E5M2> {
- %ret = amdgpu.packed_scaled_trunc %v into undef[index 0], %scale : vector<2xbf16> to vector<4xf8E5M2>
+ %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xbf16> to vector<4xf8E5M2>
func.return %ret : vector<4xf8E5M2>
}
@@ -150,7 +150,7 @@ func.func @packed_scaled_trunc_f8e5m2_bf16(%v: vector<2xbf16>, %scale: f32) -> v
// CHECK: [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E5M2>
// CHECK: return [[CAST]] : vector<4xf8E5M2>
func.func @packed_scaled_trunc_into_f8e5m2_bf16(%v: vector<2xbf16>, %existing: vector<4xf8E5M2>, %scale: f32) -> vector<4xf8E5M2> {
- %ret = amdgpu.packed_scaled_trunc %v into %existing[index 0], %scale : vector<2xbf16> to vector<4xf8E5M2> into vector<4xf8E5M2>
+ %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xbf16> to vector<4xf8E5M2> into vector<4xf8E5M2>
func.return %ret : vector<4xf8E5M2>
}
@@ -165,7 +165,7 @@ func.func @packed_scaled_trunc_into_f8e5m2_bf16(%v: vector<2xbf16>, %existing: v
// CHECK: [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<8xi4> to vector<8xf4E2M1FN>
// CHECK: return [[CAST]] : vector<8xf4E2M1FN>
func.func @packed_scaled_trunc_f4e2m1_f32(%v: vector<2xf32>, %scale: f32) -> vector<8xf4E2M1FN> {
- %ret = amdgpu.packed_scaled_trunc %v into undef[index 0], %scale : vector<2xf32> to vector<8xf4E2M1FN>
+ %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xf32> to vector<8xf4E2M1FN>
func.return %ret : vector<8xf4E2M1FN>
}
@@ -181,7 +181,7 @@ func.func @packed_scaled_trunc_f4e2m1_f32(%v: vector<2xf32>, %scale: f32) -> vec
// CHECK: [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<8xi4> to vector<8xf4E2M1FN>
// CHECK: return [[CAST]] : vector<8xf4E2M1FN>
func.func @packed_scaled_trunc_into_f4e2m1_f32(%v: vector<2xf32>, %existing: vector<8xf4E2M1FN>, %scale: f32) -> vector<8xf4E2M1FN> {
- %ret = amdgpu.packed_scaled_trunc %v into %existing[index 0], %scale : vector<2xf32> to vector<8xf4E2M1FN> into vector<8xf4E2M1FN>
+ %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xf32> to vector<8xf4E2M1FN> into vector<8xf4E2M1FN>
func.return %ret : vector<8xf4E2M1FN>
}
@@ -192,7 +192,7 @@ func.func @packed_scaled_trunc_into_f4e2m1_f32(%v: vector<2xf32>, %existing: vec
// CHECK: [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<8xi4> to vector<8xf4E2M1FN>
// CHECK: return [[CAST]] : vector<8xf4E2M1FN>
func.func @packed_scaled_trunc_f4e2m1_f16(%v: vector<2xf16>, %scale: f32) -> vector<8xf4E2M1FN> {
- %ret = amdgpu.packed_scaled_trunc %v into undef[index 0], %scale : vector<2xf16> to vector<8xf4E2M1FN>
+ %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xf16> to vector<8xf4E2M1FN>
func.return %ret : vector<8xf4E2M1FN>
}
@@ -204,7 +204,7 @@ func.func @packed_scaled_trunc_f4e2m1_f16(%v: vector<2xf16>, %scale: f32) -> vec
// CHECK: [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<8xi4> to vector<8xf4E2M1FN>
// CHECK: return [[CAST]] : vector<8xf4E2M1FN>
func.func @packed_scaled_trunc_into_f4e2m1_f16(%v: vector<2xf16>, %existing: vector<8xf4E2M1FN>, %scale: f32) -> vector<8xf4E2M1FN> {
- %ret = amdgpu.packed_scaled_trunc %v into %existing[index 0], %scale : vector<2xf16> to vector<8xf4E2M1FN> into vector<8xf4E2M1FN>
+ %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xf16> to vector<8xf4E2M1FN> into vector<8xf4E2M1FN>
func.return %ret : vector<8xf4E2M1FN>
}
@@ -215,7 +215,7 @@ func.func @packed_scaled_trunc_into_f4e2m1_f16(%v: vector<2xf16>, %existing: vec
// CHECK: [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<8xi4> to vector<8xf4E2M1FN>
// CHECK: return [[CAST]] : vector<8xf4E2M1FN>
func.func @packed_scaled_trunc_f4e2m1_bf16(%v: vector<2xbf16>, %scale: f32) -> vector<8xf4E2M1FN> {
- %ret = amdgpu.packed_scaled_trunc %v into undef[index 0], %scale : vector<2xbf16> to vector<8xf4E2M1FN>
+ %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xbf16> to vector<8xf4E2M1FN>
func.return %ret : vector<8xf4E2M1FN>
}
@@ -227,6 +227,6 @@ func.func @packed_scaled_trunc_f4e2m1_bf16(%v: vector<2xbf16>, %scale: f32) -> v
// CHECK: [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<8xi4> to vector<8xf4E2M1FN>
// CHECK: return [[CAST]] : vector<8xf4E2M1FN>
func.func @packed_scaled_trunc_into_f4e2m1_bf16(%v: vector<2xbf16>, %existing: vector<8xf4E2M1FN>, %scale: f32) -> vector<8xf4E2M1FN> {
- %ret = amdgpu.packed_scaled_trunc %v into %existing[index 0], %scale : vector<2xbf16> to vector<8xf4E2M1FN> into vector<8xf4E2M1FN>
+ %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xbf16> to vector<8xf4E2M1FN> into vector<8xf4E2M1FN>
func.return %ret : vector<8xf4E2M1FN>
}
>From b6023472479b47c543e9090ef8360894c8e66b43 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Tue, 10 Jun 2025 11:16:24 +0000
Subject: [PATCH 6/6] fix error msg
---
mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index c13fc63684cb4..9e347d1d1cda6 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -1263,7 +1263,7 @@ LogicalResult ScaledExtPackedOpLowering::matchAndRewrite(
Location loc = op.getLoc();
if (chipset != kGfx950)
return rewriter.notifyMatchFailure(
- loc, "Scaled fp8 conversion instructions are not available on target "
+ loc, "Scaled fp conversion instructions are not available on target "
"architecture and their emulation is not implemented");
Type i32 = getTypeConverter()->convertType(rewriter.getI32Type());
@@ -1344,7 +1344,7 @@ LogicalResult PackedScaledTruncOpLowering::matchAndRewrite(
Location loc = op.getLoc();
if (chipset != kGfx950)
return rewriter.notifyMatchFailure(
- loc, "Scaled fp8 conversion instructions are not available on target "
+ loc, "Scaled fp conversion instructions are not available on target "
"architecture and their emulation is not implemented");
Type v2i16 = getTypeConverter()->convertType(
VectorType::get(2, rewriter.getI16Type()));
More information about the Mlir-commits
mailing list