[Mlir-commits] [mlir] [mlir][amdgpu] Add conversion from arith.scaling_extf to amdgpu (PR #146372)

Tim Gymnich llvmlistbot at llvm.org
Tue Jul 8 07:26:15 PDT 2025


https://github.com/tgymnich updated https://github.com/llvm/llvm-project/pull/146372

>From 818aa667b30959713cc582da1ab54f51689deacc Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Thu, 26 Jun 2025 09:54:55 +0000
Subject: [PATCH 01/17] [mlir][amdgpu] Add conversion for arith.scaling_extf to
 amdgpu

---
 .../ArithToAMDGPU/ArithToAMDGPU.cpp           | 272 ++++++++++++++++++
 1 file changed, 272 insertions(+)

diff --git a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
index 3596b3235a631..2178499031e14 100644
--- a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
+++ b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
@@ -14,12 +14,18 @@
 #include "mlir/Dialect/Arith/Utils/Utils.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
+#include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h"
+#include "mlir/Dialect/Vector/Utils/VectorUtils.h"
 #include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Matchers.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/IR/TypeUtilities.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/LogicalResult.h"
 
 namespace mlir {
 #define GEN_PASS_DEF_ARITHTOAMDGPUCONVERSIONPASS
@@ -32,6 +38,7 @@ using namespace mlir::amdgpu;
 namespace {
 // Define commonly used chipsets versions for convenience.
 constexpr Chipset kGfx942 = Chipset(9, 4, 2);
+constexpr Chipset kGfx950 = Chipset(9, 5, 0);
 
 struct ArithToAMDGPUConversionPass final
     : impl::ArithToAMDGPUConversionPassBase<ArithToAMDGPUConversionPass> {
@@ -73,6 +80,28 @@ struct TruncfToFloat16RewritePattern final
                                 PatternRewriter &rewriter) const override;
 };
 
+struct ScalingExtFRewritePattern final
+    : OpRewritePattern<arith::ScalingExtFOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  ScalingExtFRewritePattern(MLIRContext *ctx)
+      : OpRewritePattern::OpRewritePattern(ctx) {}
+
+  LogicalResult matchAndRewrite(arith::ScalingExtFOp op,
+                                PatternRewriter &rewriter) const override;
+};
+
+struct ScalingTruncFRewritePattern final
+    : OpRewritePattern<arith::ScalingTruncFOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  ScalingTruncFRewritePattern(MLIRContext *ctx)
+      : OpRewritePattern::OpRewritePattern(ctx) {}
+
+  LogicalResult matchAndRewrite(arith::ScalingTruncFOp op,
+                                PatternRewriter &rewriter) const override;
+};
+
 } // end namespace
 
 static bool isSupportedF8(Type elementType, Chipset chipset) {
@@ -395,6 +424,244 @@ LogicalResult TruncfToFloat16RewritePattern::matchAndRewrite(
   return success();
 }
 
+static Value getOriginalVectorValue(Value value) {
+  Value current = value;
+  while (Operation *definingOp = current.getDefiningOp()) {
+    bool skipOp = llvm::TypeSwitch<Operation *, bool>(definingOp)
+                      .Case<vector::ShapeCastOp>([&current](auto op) {
+                        current = op.getSource();
+                        return true;
+                      })
+                      .Case<vector::BroadcastOp>([&current](auto op) {
+                        current = op.getSource();
+                        return false;
+                      })
+                      .Case<vector::SplatOp>([&current](auto op) {
+                        current = op.getInput();
+                        return false;
+                      })
+                      .Default([](Operation *) { return false; });
+
+    if (!skipOp) {
+      break;
+    }
+  }
+  return current;
+}
+
+LogicalResult
+ScalingExtFRewritePattern::matchAndRewrite(arith::ScalingExtFOp op,
+                                           PatternRewriter &rewriter) const {
+  Location loc = op.getLoc();
+  constexpr const int64_t opWidth = 2;
+
+  Value in = op.getIn();
+  Value scale = op.getScale();
+  Value out = op.getOut();
+
+  Type f32 = rewriter.getF32Type();
+  Type inType = getElementTypeOrSelf(in);
+  Type scaleType = getElementTypeOrSelf(scale);
+  Type outType = getElementTypeOrSelf(out);
+  VectorType scaleVecType = dyn_cast<VectorType>(scale.getType());
+  VectorType inVecType = dyn_cast<VectorType>(in.getType());
+  VectorType outVecType = dyn_cast<VectorType>(out.getType());
+
+  if (outVecType && outVecType.isScalable())
+    return failure();
+
+  Type scaleF32Type =
+      scaleVecType ? VectorType::get(scaleVecType.getShape(), f32) : f32;
+  if (scaleType.getIntOrFloatBitWidth() < 32)
+    scale = rewriter.create<arith::ExtFOp>(loc, scaleF32Type, scale);
+  else if (scaleType.getIntOrFloatBitWidth() > 32)
+    scale = rewriter.create<arith::TruncFOp>(loc, scaleF32Type, scale);
+
+  VectorType extScaleResultType = VectorType::get(opWidth, outType);
+
+  if (!outVecType) {
+    Value inCast =
+        rewriter.create<vector::SplatOp>(loc, VectorType::get(1, inType), in);
+    Value scaleExt = rewriter.create<amdgpu::ScaledExtPackedOp>(
+        loc, extScaleResultType, inCast, scale, 0);
+    scaleExt = rewriter.replaceOpWithNewOp<vector::ExtractOp>(op, scaleExt, 0);
+    return success();
+  }
+
+  Value origScale = getOriginalVectorValue(scale);
+  Type origScaleType = origScale.getType();
+  VectorType origScaleVecType = isa<VectorType>(origScaleType)
+                                    ? cast<VectorType>(origScaleType)
+                                    : VectorType::get(1, origScaleType);
+
+  ArrayRef<int64_t> originalScaleShape = origScaleVecType.getShape();
+  ArrayRef<int64_t> inShape = inVecType.getShape();
+
+  SmallVector<int64_t> paddedScaleShape(originalScaleShape);
+  paddedScaleShape.insert(paddedScaleShape.end(),
+                          inShape.size() - originalScaleShape.size(), 1);
+
+  auto ratio = computeShapeRatio(inShape, paddedScaleShape);
+  if (!ratio)
+    return failure();
+
+  const int64_t blockSize = computeProduct(*ratio);
+
+  Value zero = rewriter.create<arith::ConstantOp>(
+      loc, outType, rewriter.getFloatAttr(outType, 0.0));
+  Value result = rewriter.createOrFold<vector::SplatOp>(loc, outVecType, zero);
+
+  for (SmallVector<int64_t> offsets : StaticTileOffsetRange(inShape, *ratio)) {
+    SmallVector<int64_t> strides(offsets.size(), 1);
+    Value block = rewriter.create<vector::ExtractStridedSliceOp>(
+        loc, in, offsets, *ratio, strides);
+    VectorType block1DType = VectorType::get(blockSize, inType);
+    Value block1D =
+        rewriter.create<vector::ShapeCastOp>(loc, block1DType, block);
+    Value uniformScale =
+        rewriter.create<vector::ExtractOp>(loc, scale, offsets);
+
+    VectorType blockResultType = VectorType::get(blockSize, outType);
+    Value blockResult =
+        rewriter.createOrFold<vector::SplatOp>(loc, blockResultType, zero);
+
+    for (int64_t i = 0, sliceWidth = opWidth - blockSize % opWidth;
+         i < blockSize;
+         i += sliceWidth, sliceWidth = opWidth - blockSize % opWidth) {
+      Value slice = rewriter.create<vector::ExtractStridedSliceOp>(
+          loc, block1D, i, sliceWidth, 1);
+      Value scaleExt = rewriter.create<amdgpu::ScaledExtPackedOp>(
+          loc, extScaleResultType, slice, uniformScale, 0);
+      if (sliceWidth != opWidth)
+        scaleExt = rewriter.create<vector::ExtractStridedSliceOp>(
+            loc, scaleExt, 0, sliceWidth, 1);
+      blockResult = rewriter.create<vector::InsertStridedSliceOp>(
+          loc, scaleExt, blockResult, i, 1);
+    }
+
+    VectorType resultType = VectorType::get(*ratio, outType);
+    Value cast =
+        rewriter.create<vector::ShapeCastOp>(loc, resultType, blockResult);
+    result = rewriter.create<vector::InsertStridedSliceOp>(loc, cast, result,
+                                                           offsets, strides);
+  }
+
+  rewriter.replaceOp(op, result);
+
+  return success();
+}
+
+LogicalResult
+ScalingTruncFRewritePattern::matchAndRewrite(arith::ScalingTruncFOp op,
+                                             PatternRewriter &rewriter) const {
+  Location loc = op.getLoc();
+  constexpr const int64_t opWidth = 2;
+
+  Value in = op.getIn();
+  Value scale = op.getScale();
+  Value out = op.getOut();
+
+  Type f32 = rewriter.getF32Type();
+  Type inType = getElementTypeOrSelf(in);
+  Type scaleType = getElementTypeOrSelf(scale);
+  Type outType = getElementTypeOrSelf(out);
+  VectorType scaleVecType = dyn_cast<VectorType>(scale.getType());
+  VectorType inVecType = dyn_cast<VectorType>(in.getType());
+  VectorType outVecType = dyn_cast<VectorType>(out.getType());
+
+  if (outVecType && outVecType.isScalable())
+    return failure();
+
+  Type scaleF32Type =
+      scaleVecType ? VectorType::get(scaleVecType.getShape(), f32) : f32;
+  if (scaleType.getIntOrFloatBitWidth() < 32)
+    scale = rewriter.create<arith::ExtFOp>(loc, scaleF32Type, scale);
+  else if (scaleType.getIntOrFloatBitWidth() > 32)
+    scale = rewriter.create<arith::TruncFOp>(loc, scaleF32Type, scale);
+
+  Value zero = rewriter.create<arith::ConstantOp>(
+      loc, outType, rewriter.getFloatAttr(outType, 0.0));
+  unsigned numPackedElem = 32 / outType.getIntOrFloatBitWidth();
+  VectorType truncScaleResultType = VectorType::get(numPackedElem, outType);
+
+  if (!outVecType) {
+    Type inVecType = VectorType::get(1, inType);
+    // Type existingVecType = VectorType::get(opWidth, outType);
+    Value inCast = rewriter.create<vector::SplatOp>(loc, inVecType, in);
+    // Value existing =
+    //     rewriter.createOrFold<vector::SplatOp>(loc, existingVecType, zero);
+    Value scaleTrunc = rewriter.create<amdgpu::PackedScaledTruncOp>(
+        loc, truncScaleResultType, inCast, scale, 0, /*existing=*/nullptr);
+    scaleTrunc =
+        rewriter.replaceOpWithNewOp<vector::ExtractOp>(op, scaleTrunc, 0);
+    return success();
+  }
+
+  Value origScale = getOriginalVectorValue(scale);
+  Type origScaleType = origScale.getType();
+  VectorType origScaleVecType = isa<VectorType>(origScaleType)
+                                    ? cast<VectorType>(origScaleType)
+                                    : VectorType::get(1, origScaleType);
+
+  ArrayRef<int64_t> originalScaleShape = origScaleVecType.getShape();
+  ArrayRef<int64_t> inShape = inVecType.getShape();
+
+  SmallVector<int64_t> paddedScaleShape(originalScaleShape);
+  paddedScaleShape.insert(paddedScaleShape.end(),
+                          inShape.size() - originalScaleShape.size(), 1);
+
+  auto ratio = computeShapeRatio(inShape, paddedScaleShape);
+  if (!ratio)
+    return failure();
+
+  const int64_t blockSize = computeProduct(*ratio);
+
+  Value result = rewriter.createOrFold<vector::SplatOp>(loc, outVecType, zero);
+
+  for (SmallVector<int64_t> offsets : StaticTileOffsetRange(inShape, *ratio)) {
+    SmallVector<int64_t> strides(offsets.size(), 1);
+    Value block = rewriter.create<vector::ExtractStridedSliceOp>(
+        loc, in, offsets, *ratio, strides);
+    VectorType block1DType = VectorType::get(blockSize, inType);
+    Value block1D =
+        rewriter.create<vector::ShapeCastOp>(loc, block1DType, block);
+    Value uniformScale =
+        rewriter.create<vector::ExtractOp>(loc, scale, offsets);
+
+    VectorType blockResultType = VectorType::get(blockSize, outType);
+    Value blockResult =
+        rewriter.createOrFold<vector::SplatOp>(loc, blockResultType, zero);
+
+    for (int64_t i = 0, sliceWidth = opWidth - blockSize % opWidth;
+         i < blockSize;
+         i += sliceWidth, sliceWidth = opWidth - blockSize % opWidth) {
+      Value slice = rewriter.create<vector::ExtractStridedSliceOp>(
+          loc, block1D, i, sliceWidth, 1);
+      // VectorType existingVecType = VectorType::get(opWidth, outType);
+      // Value existing =
+      //     rewriter.createOrFold<vector::SplatOp>(loc, existingVecType, zero);
+      Value scaleTrunc = rewriter.create<amdgpu::PackedScaledTruncOp>(
+          loc, truncScaleResultType, slice, uniformScale, 0,
+          /*existing=*/nullptr);
+      if (sliceWidth != opWidth)
+        scaleTrunc = rewriter.create<vector::ExtractStridedSliceOp>(
+            loc, scaleTrunc, 0, sliceWidth, 1);
+      blockResult = rewriter.create<vector::InsertStridedSliceOp>(
+          loc, scaleTrunc, blockResult, i, 1);
+    }
+
+    VectorType resultType = VectorType::get(*ratio, outType);
+    Value cast =
+        rewriter.create<vector::ShapeCastOp>(loc, resultType, blockResult);
+    result = rewriter.create<vector::InsertStridedSliceOp>(loc, cast, result,
+                                                           offsets, strides);
+  }
+
+  rewriter.replaceOp(op, result);
+
+  return success();
+}
+
 void mlir::arith::populateArithToAMDGPUConversionPatterns(
     RewritePatternSet &patterns, bool convertFP8Arithmetic,
     bool saturateFP8Truncf, bool allowPackedF16Rtz, Chipset chipset) {
@@ -406,6 +673,11 @@ void mlir::arith::populateArithToAMDGPUConversionPatterns(
   }
   if (allowPackedF16Rtz)
     patterns.add<TruncfToFloat16RewritePattern>(patterns.getContext());
+
+  if (chipset >= kGfx950) {
+    patterns.add<ScalingExtFRewritePattern>(patterns.getContext());
+    patterns.add<ScalingTruncFRewritePattern>(patterns.getContext());
+  }
 }
 
 void ArithToAMDGPUConversionPass::runOnOperation() {

>From 4da246f5244af2703cd29b969169c547f01d9087 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Mon, 30 Jun 2025 12:18:36 +0000
Subject: [PATCH 02/17] add tests

---
 .../ArithToAMDGPU/scaling-extf.mlir           | 635 ++++++++++++++++++
 .../ArithToAMDGPU/scaling-truncf.mlir         | 557 +++++++++++++++
 2 files changed, 1192 insertions(+)
 create mode 100644 mlir/test/Conversion/ArithToAMDGPU/scaling-extf.mlir
 create mode 100644 mlir/test/Conversion/ArithToAMDGPU/scaling-truncf.mlir

diff --git a/mlir/test/Conversion/ArithToAMDGPU/scaling-extf.mlir b/mlir/test/Conversion/ArithToAMDGPU/scaling-extf.mlir
new file mode 100644
index 0000000000000..47266f55f9cf3
--- /dev/null
+++ b/mlir/test/Conversion/ArithToAMDGPU/scaling-extf.mlir
@@ -0,0 +1,635 @@
+// RUN: mlir-opt --split-input-file %s -convert-arith-to-amdgpu="chipset=gfx950" | FileCheck %s
+
+// CHECK-LABEL: @conversion_f8_f32_fallback
+// CHECK:         [[CST:%.+]] = arith.constant dense<0.000000e+00> : vector<2x2xf32>
+// CHECK-NEXT:    [[SCALE_EXT:%.+]] = arith.extf %arg1 : vector<2x2xf8E8M0FNU> to vector<2x2xf32>
+// CHECK-NEXT:    [[IN_SLICE_00:%.+]] = vector.extract_strided_slice %arg0 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf8E5M2> to vector<1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_00:%.+]] = vector.shape_cast [[IN_SLICE_00]] : vector<1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_00:%.+]] = vector.extract [[SCALE_EXT]][0, 0] : f32 from vector<2x2xf32>
+// CHECK-NEXT:    [[PACKED_00:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_00]][0], [[SCALE_SCALAR_00]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_00:%.+]] = vector.extract_strided_slice [[PACKED_00]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_00:%.+]] = vector.shape_cast [[OUT_SLICE_00]] : vector<1xf32> to vector<1x1xf32>
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_00]], [[CST]] {offsets = [0, 0], strides = [1, 1]} : vector<1x1xf32> into vector<2x2xf32>
+// CHECK-NEXT:    [[IN_SLICE_01:%.+]] = vector.extract_strided_slice %arg0 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf8E5M2> to vector<1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_01:%.+]] = vector.shape_cast [[IN_SLICE_01]] : vector<1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_01:%.+]] = vector.extract [[SCALE_EXT]][0, 1] : f32 from vector<2x2xf32>
+// CHECK-NEXT:    [[PACKED_01:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_01]][0], [[SCALE_SCALAR_01]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_01:%.+]] = vector.extract_strided_slice [[PACKED_01]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_01:%.+]] = vector.shape_cast [[OUT_SLICE_01]] : vector<1xf32> to vector<1x1xf32>
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_01]], [[ACC_A]] {offsets = [0, 1], strides = [1, 1]} : vector<1x1xf32> into vector<2x2xf32>
+// CHECK-NEXT:    [[IN_SLICE_10:%.+]] = vector.extract_strided_slice %arg0 {offsets = [1, 0], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf8E5M2> to vector<1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_10:%.+]] = vector.shape_cast [[IN_SLICE_10]] : vector<1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_10:%.+]] = vector.extract [[SCALE_EXT]][1, 0] : f32 from vector<2x2xf32>
+// CHECK-NEXT:    [[PACKED_10:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_10]][0], [[SCALE_SCALAR_10]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_10:%.+]] = vector.extract_strided_slice [[PACKED_10]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_10:%.+]] = vector.shape_cast [[OUT_SLICE_10]] : vector<1xf32> to vector<1x1xf32>
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_10]], [[ACC_B]] {offsets = [1, 0], strides = [1, 1]} : vector<1x1xf32> into vector<2x2xf32>
+// CHECK-NEXT:    [[IN_SLICE_11:%.+]] = vector.extract_strided_slice %arg0 {offsets = [1, 1], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf8E5M2> to vector<1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_11:%.+]] = vector.shape_cast [[IN_SLICE_11]] : vector<1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_11:%.+]] = vector.extract [[SCALE_EXT]][1, 1] : f32 from vector<2x2xf32>
+// CHECK-NEXT:    [[PACKED_11:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_11]][0], [[SCALE_SCALAR_11]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_11:%.+]] = vector.extract_strided_slice [[PACKED_11]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_11:%.+]] = vector.shape_cast [[OUT_SLICE_11]] : vector<1xf32> to vector<1x1xf32>
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_11]], [[ACC_A]] {offsets = [1, 1], strides = [1, 1]} : vector<1x1xf32> into vector<2x2xf32>
+// CHECK-NEXT:    return [[ACC_B]] : vector<2x2xf32>
+func.func @conversion_f8_f32_fallback(%in: vector<2x2xf8E5M2>, %scale: vector<2x2xf8E8M0FNU>) -> vector<2x2xf32> {
+    %ext = arith.scaling_extf %in, %scale : vector<2x2xf8E5M2>, vector<2x2xf8E8M0FNU> to vector<2x2xf32>
+    return %ext : vector<2x2xf32>
+}
+
+// -----
+
+// CHECK-LABEL: @conversion_f4_f32_fallback
+// CHECK:         [[CST:%.+]] = arith.constant dense<0.000000e+00> : vector<2x2xf32>
+// CHECK-NEXT:    [[SCALE_EXT:%.+]] = arith.extf %arg1 : vector<2x2xf8E8M0FNU> to vector<2x2xf32>
+// CHECK-NEXT:    [[IN_SLICE_00:%.+]] = vector.extract_strided_slice %arg0 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf4E2M1FN> to vector<1x1xf4E2M1FN>
+// CHECK-NEXT:    [[IN_VEC_00:%.+]] = vector.shape_cast [[IN_SLICE_00]] : vector<1x1xf4E2M1FN> to vector<1xf4E2M1FN>
+// CHECK-NEXT:    [[SCALE_SCALAR_00:%.+]] = vector.extract [[SCALE_EXT]][0, 0] : f32 from vector<2x2xf32>
+// CHECK-NEXT:    [[PACKED_00:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_00]][0], [[SCALE_SCALAR_00]] : vector<1xf4E2M1FN> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_00:%.+]] = vector.extract_strided_slice [[PACKED_00]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_00:%.+]] = vector.shape_cast [[OUT_SLICE_00]] : vector<1xf32> to vector<1x1xf32>
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_00]], [[CST]] {offsets = [0, 0], strides = [1, 1]} : vector<1x1xf32> into vector<2x2xf32>
+// CHECK-NEXT:    [[IN_SLICE_01:%.+]] = vector.extract_strided_slice %arg0 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf4E2M1FN> to vector<1x1xf4E2M1FN>
+// CHECK-NEXT:    [[IN_VEC_01:%.+]] = vector.shape_cast [[IN_SLICE_01]] : vector<1x1xf4E2M1FN> to vector<1xf4E2M1FN>
+// CHECK-NEXT:    [[SCALE_SCALAR_01:%.+]] = vector.extract [[SCALE_EXT]][0, 1] : f32 from vector<2x2xf32>
+// CHECK-NEXT:    [[PACKED_01:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_01]][0], [[SCALE_SCALAR_01]] : vector<1xf4E2M1FN> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_01:%.+]] = vector.extract_strided_slice [[PACKED_01]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_01:%.+]] = vector.shape_cast [[OUT_SLICE_01]] : vector<1xf32> to vector<1x1xf32>
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_01]], [[ACC_A]] {offsets = [0, 1], strides = [1, 1]} : vector<1x1xf32> into vector<2x2xf32>
+// CHECK-NEXT:    [[IN_SLICE_10:%.+]] = vector.extract_strided_slice %arg0 {offsets = [1, 0], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf4E2M1FN> to vector<1x1xf4E2M1FN>
+// CHECK-NEXT:    [[IN_VEC_10:%.+]] = vector.shape_cast [[IN_SLICE_10]] : vector<1x1xf4E2M1FN> to vector<1xf4E2M1FN>
+// CHECK-NEXT:    [[SCALE_SCALAR_10:%.+]] = vector.extract [[SCALE_EXT]][1, 0] : f32 from vector<2x2xf32>
+// CHECK-NEXT:    [[PACKED_10:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_10]][0], [[SCALE_SCALAR_10]] : vector<1xf4E2M1FN> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_10:%.+]] = vector.extract_strided_slice [[PACKED_10]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_10:%.+]] = vector.shape_cast [[OUT_SLICE_10]] : vector<1xf32> to vector<1x1xf32>
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_10]], [[ACC_B]] {offsets = [1, 0], strides = [1, 1]} : vector<1x1xf32> into vector<2x2xf32>
+// CHECK-NEXT:    [[IN_SLICE_11:%.+]] = vector.extract_strided_slice %arg0 {offsets = [1, 1], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf4E2M1FN> to vector<1x1xf4E2M1FN>
+// CHECK-NEXT:    [[IN_VEC_11:%.+]] = vector.shape_cast [[IN_SLICE_11]] : vector<1x1xf4E2M1FN> to vector<1xf4E2M1FN>
+// CHECK-NEXT:    [[SCALE_SCALAR_11:%.+]] = vector.extract [[SCALE_EXT]][1, 1] : f32 from vector<2x2xf32>
+// CHECK-NEXT:    [[PACKED_11:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_11]][0], [[SCALE_SCALAR_11]] : vector<1xf4E2M1FN> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_11:%.+]] = vector.extract_strided_slice [[PACKED_11]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_11:%.+]] = vector.shape_cast [[OUT_SLICE_11]] : vector<1xf32> to vector<1x1xf32>
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_11]], [[ACC_A]] {offsets = [1, 1], strides = [1, 1]} : vector<1x1xf32> into vector<2x2xf32>
+// CHECK-NEXT:    return [[ACC_B]] : vector<2x2xf32>
+func.func @conversion_f4_f32_fallback(%in: vector<2x2xf4E2M1FN>, %scale: vector<2x2xf8E8M0FNU>) -> vector<2x2xf32> {
+    %ext = arith.scaling_extf %in, %scale : vector<2x2xf4E2M1FN>, vector<2x2xf8E8M0FNU> to vector<2x2xf32>
+    return %ext : vector<2x2xf32>
+}
+
+// -----
+
+// CHECK-LABEL: @conversion_f8_f16_fallback
+// CHECK:         [[CST:%.+]] = arith.constant dense<0.000000e+00> : vector<2x2xf16>
+// CHECK-NEXT:    [[SCALE_EXT:%.+]] = arith.extf %arg1 : vector<2x2xf8E8M0FNU> to vector<2x2xf32>
+// CHECK-NEXT:    [[IN_SLICE_00:%.+]] = vector.extract_strided_slice %arg0 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf8E5M2> to vector<1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_00:%.+]] = vector.shape_cast [[IN_SLICE_00]] : vector<1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_00:%.+]] = vector.extract [[SCALE_EXT]][0, 0] : f32 from vector<2x2xf32>
+// CHECK-NEXT:    [[PACKED_00:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_00]][0], [[SCALE_SCALAR_00]] : vector<1xf8E5M2> to vector<2xf16>
+// CHECK-NEXT:    [[OUT_SLICE_00:%.+]] = vector.extract_strided_slice [[PACKED_00]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf16> to vector<1xf16>
+// CHECK-NEXT:    [[OUT_VEC_00:%.+]] = vector.shape_cast [[OUT_SLICE_00]] : vector<1xf16> to vector<1x1xf16>
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_00]], [[CST]] {offsets = [0, 0], strides = [1, 1]} : vector<1x1xf16> into vector<2x2xf16>
+// CHECK-NEXT:    [[IN_SLICE_01:%.+]] = vector.extract_strided_slice %arg0 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf8E5M2> to vector<1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_01:%.+]] = vector.shape_cast [[IN_SLICE_01]] : vector<1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_01:%.+]] = vector.extract [[SCALE_EXT]][0, 1] : f32 from vector<2x2xf32>
+// CHECK-NEXT:    [[PACKED_01:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_01]][0], [[SCALE_SCALAR_01]] : vector<1xf8E5M2> to vector<2xf16>
+// CHECK-NEXT:    [[OUT_SLICE_01:%.+]] = vector.extract_strided_slice [[PACKED_01]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf16> to vector<1xf16>
+// CHECK-NEXT:    [[OUT_VEC_01:%.+]] = vector.shape_cast [[OUT_SLICE_01]] : vector<1xf16> to vector<1x1xf16>
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_01]], [[ACC_A]] {offsets = [0, 1], strides = [1, 1]} : vector<1x1xf16> into vector<2x2xf16>
+// CHECK-NEXT:    [[IN_SLICE_10:%.+]] = vector.extract_strided_slice %arg0 {offsets = [1, 0], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf8E5M2> to vector<1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_10:%.+]] = vector.shape_cast [[IN_SLICE_10]] : vector<1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_10:%.+]] = vector.extract [[SCALE_EXT]][1, 0] : f32 from vector<2x2xf32>
+// CHECK-NEXT:    [[PACKED_10:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_10]][0], [[SCALE_SCALAR_10]] : vector<1xf8E5M2> to vector<2xf16>
+// CHECK-NEXT:    [[OUT_SLICE_10:%.+]] = vector.extract_strided_slice [[PACKED_10]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf16> to vector<1xf16>
+// CHECK-NEXT:    [[OUT_VEC_10:%.+]] = vector.shape_cast [[OUT_SLICE_10]] : vector<1xf16> to vector<1x1xf16>
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_10]], [[ACC_B]] {offsets = [1, 0], strides = [1, 1]} : vector<1x1xf16> into vector<2x2xf16>
+// CHECK-NEXT:    [[IN_SLICE_11:%.+]] = vector.extract_strided_slice %arg0 {offsets = [1, 1], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf8E5M2> to vector<1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_11:%.+]] = vector.shape_cast [[IN_SLICE_11]] : vector<1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_11:%.+]] = vector.extract [[SCALE_EXT]][1, 1] : f32 from vector<2x2xf32>
+// CHECK-NEXT:    [[PACKED_11:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_11]][0], [[SCALE_SCALAR_11]] : vector<1xf8E5M2> to vector<2xf16>
+// CHECK-NEXT:    [[OUT_SLICE_11:%.+]] = vector.extract_strided_slice [[PACKED_11]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf16> to vector<1xf16>
+// CHECK-NEXT:    [[OUT_VEC_11:%.+]] = vector.shape_cast [[OUT_SLICE_11]] : vector<1xf16> to vector<1x1xf16>
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_11]], [[ACC_A]] {offsets = [1, 1], strides = [1, 1]} : vector<1x1xf16> into vector<2x2xf16>
+// CHECK-NEXT:    return [[ACC_B]] : vector<2x2xf16>
+func.func @conversion_f8_f16_fallback(%in: vector<2x2xf8E5M2>, %scale: vector<2x2xf8E8M0FNU>) -> vector<2x2xf16> {
+    %ext = arith.scaling_extf %in, %scale : vector<2x2xf8E5M2>, vector<2x2xf8E8M0FNU> to vector<2x2xf16>
+    return %ext : vector<2x2xf16>
+}
+
+// -----
+
+// CHECK-LABEL: @conversion_f4_f16_fallback
+// CHECK:         [[CST:%.+]] = arith.constant dense<0.000000e+00> : vector<2x2xf16>
+// CHECK-NEXT:    [[SCALE_EXT:%.+]] = arith.extf %arg1 : vector<2x2xf8E8M0FNU> to vector<2x2xf32>
+// CHECK-NEXT:    [[IN_SLICE_00:%.+]] = vector.extract_strided_slice %arg0 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf4E2M1FN> to vector<1x1xf4E2M1FN>
+// CHECK-NEXT:    [[IN_VEC_00:%.+]] = vector.shape_cast [[IN_SLICE_00]] : vector<1x1xf4E2M1FN> to vector<1xf4E2M1FN>
+// CHECK-NEXT:    [[SCALE_SCALAR_00:%.+]] = vector.extract [[SCALE_EXT]][0, 0] : f32 from vector<2x2xf32>
+// CHECK-NEXT:    [[PACKED_00:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_00]][0], [[SCALE_SCALAR_00]] : vector<1xf4E2M1FN> to vector<2xf16>
+// CHECK-NEXT:    [[OUT_SLICE_00:%.+]] = vector.extract_strided_slice [[PACKED_00]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf16> to vector<1xf16>
+// CHECK-NEXT:    [[OUT_VEC_00:%.+]] = vector.shape_cast [[OUT_SLICE_00]] : vector<1xf16> to vector<1x1xf16>
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_00]], [[CST]] {offsets = [0, 0], strides = [1, 1]} : vector<1x1xf16> into vector<2x2xf16>
+// CHECK-NEXT:    [[IN_SLICE_01:%.+]] = vector.extract_strided_slice %arg0 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf4E2M1FN> to vector<1x1xf4E2M1FN>
+// CHECK-NEXT:    [[IN_VEC_01:%.+]] = vector.shape_cast [[IN_SLICE_01]] : vector<1x1xf4E2M1FN> to vector<1xf4E2M1FN>
+// CHECK-NEXT:    [[SCALE_SCALAR_01:%.+]] = vector.extract [[SCALE_EXT]][0, 1] : f32 from vector<2x2xf32>
+// CHECK-NEXT:    [[PACKED_01:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_01]][0], [[SCALE_SCALAR_01]] : vector<1xf4E2M1FN> to vector<2xf16>
+// CHECK-NEXT:    [[OUT_SLICE_01:%.+]] = vector.extract_strided_slice [[PACKED_01]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf16> to vector<1xf16>
+// CHECK-NEXT:    [[OUT_VEC_01:%.+]] = vector.shape_cast [[OUT_SLICE_01]] : vector<1xf16> to vector<1x1xf16>
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_01]], [[ACC_A]] {offsets = [0, 1], strides = [1, 1]} : vector<1x1xf16> into vector<2x2xf16>
+// CHECK-NEXT:    [[IN_SLICE_10:%.+]] = vector.extract_strided_slice %arg0 {offsets = [1, 0], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf4E2M1FN> to vector<1x1xf4E2M1FN>
+// CHECK-NEXT:    [[IN_VEC_10:%.+]] = vector.shape_cast [[IN_SLICE_10]] : vector<1x1xf4E2M1FN> to vector<1xf4E2M1FN>
+// CHECK-NEXT:    [[SCALE_SCALAR_10:%.+]] = vector.extract [[SCALE_EXT]][1, 0] : f32 from vector<2x2xf32>
+// CHECK-NEXT:    [[PACKED_10:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_10]][0], [[SCALE_SCALAR_10]] : vector<1xf4E2M1FN> to vector<2xf16>
+// CHECK-NEXT:    [[OUT_SLICE_10:%.+]] = vector.extract_strided_slice [[PACKED_10]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf16> to vector<1xf16>
+// CHECK-NEXT:    [[OUT_VEC_10:%.+]] = vector.shape_cast [[OUT_SLICE_10]] : vector<1xf16> to vector<1x1xf16>
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_10]], [[ACC_B]] {offsets = [1, 0], strides = [1, 1]} : vector<1x1xf16> into vector<2x2xf16>
+// CHECK-NEXT:    [[IN_SLICE_11:%.+]] = vector.extract_strided_slice %arg0 {offsets = [1, 1], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf4E2M1FN> to vector<1x1xf4E2M1FN>
+// CHECK-NEXT:    [[IN_VEC_11:%.+]] = vector.shape_cast [[IN_SLICE_11]] : vector<1x1xf4E2M1FN> to vector<1xf4E2M1FN>
+// CHECK-NEXT:    [[SCALE_SCALAR_11:%.+]] = vector.extract [[SCALE_EXT]][1, 1] : f32 from vector<2x2xf32>
+// CHECK-NEXT:    [[PACKED_11:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_11]][0], [[SCALE_SCALAR_11]] : vector<1xf4E2M1FN> to vector<2xf16>
+// CHECK-NEXT:    [[OUT_SLICE_11:%.+]] = vector.extract_strided_slice [[PACKED_11]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf16> to vector<1xf16>
+// CHECK-NEXT:    [[OUT_VEC_11:%.+]] = vector.shape_cast [[OUT_SLICE_11]] : vector<1xf16> to vector<1x1xf16>
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_11]], [[ACC_A]] {offsets = [1, 1], strides = [1, 1]} : vector<1x1xf16> into vector<2x2xf16>
+// CHECK-NEXT:    return [[ACC_B]] : vector<2x2xf16>
+// Fallback path: per-element scaling_extf lowering when no packed multi-element
+// form applies; source-level SSA names are cosmetic (printer emits %arg0/%arg1).
+func.func @conversion_f4_f16_fallback(%src: vector<2x2xf4E2M1FN>, %scales: vector<2x2xf8E8M0FNU>) -> vector<2x2xf16> {
+    %res = arith.scaling_extf %src, %scales : vector<2x2xf4E2M1FN>, vector<2x2xf8E8M0FNU> to vector<2x2xf16>
+    return %res : vector<2x2xf16>
+}
+
+// -----
+
+// CHECK-LABEL: @conversion_broadcast
+// CHECK:         [[CST:%.+]] = arith.constant dense<0.000000e+00> : vector<8x2x4xf32>
+// CHECK-NEXT:    [[BCAST:%.+]] = vector.broadcast %arg1 : vector<8x2xf8E8M0FNU> to vector<4x8x2xf8E8M0FNU>
+// CHECK-NEXT:    [[IN_CAST:%.+]] = vector.shape_cast %arg0 : vector<8x8xf8E5M2> to vector<8x2x4xf8E5M2>
+// CHECK-NEXT:    [[SCALE_CAST:%.+]] = vector.shape_cast [[BCAST]] : vector<4x8x2xf8E8M0FNU> to vector<8x2x4xf8E8M0FNU>
+// CHECK-NEXT:    [[SCALE_EXT:%.+]] = arith.extf [[SCALE_CAST]] : vector<8x2x4xf8E8M0FNU> to vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_0:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [0, 0, 0], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_0:%.+]] = vector.shape_cast [[IN_SLICE_0]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_0:%.+]] = vector.extract [[SCALE_EXT]][0, 0, 0] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_0:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_0]][0], [[SCALE_SCALAR_0]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_0:%.+]] = vector.extract_strided_slice [[PACKED_0]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_0:%.+]] = vector.shape_cast [[OUT_SLICE_0]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_0]], [[CST]] {offsets = [0, 0, 0], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_1:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [0, 0, 1], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_1:%.+]] = vector.shape_cast [[IN_SLICE_1]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_1:%.+]] = vector.extract [[SCALE_EXT]][0, 0, 1] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_1:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_1]][0], [[SCALE_SCALAR_1]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_1:%.+]] = vector.extract_strided_slice [[PACKED_1]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_1:%.+]] = vector.shape_cast [[OUT_SLICE_1]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_1]], [[ACC_A]] {offsets = [0, 0, 1], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_2:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [0, 0, 2], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_2:%.+]] = vector.shape_cast [[IN_SLICE_2]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_2:%.+]] = vector.extract [[SCALE_EXT]][0, 0, 2] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_2:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_2]][0], [[SCALE_SCALAR_2]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_2:%.+]] = vector.extract_strided_slice [[PACKED_2]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_2:%.+]] = vector.shape_cast [[OUT_SLICE_2]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_2]], [[ACC_B]] {offsets = [0, 0, 2], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_3:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [0, 0, 3], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_3:%.+]] = vector.shape_cast [[IN_SLICE_3]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_3:%.+]] = vector.extract [[SCALE_EXT]][0, 0, 3] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_3:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_3]][0], [[SCALE_SCALAR_3]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_3:%.+]] = vector.extract_strided_slice [[PACKED_3]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_3:%.+]] = vector.shape_cast [[OUT_SLICE_3]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_3]], [[ACC_A]] {offsets = [0, 0, 3], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_4:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [0, 1, 0], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_4:%.+]] = vector.shape_cast [[IN_SLICE_4]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_4:%.+]] = vector.extract [[SCALE_EXT]][0, 1, 0] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_4:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_4]][0], [[SCALE_SCALAR_4]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_4:%.+]] = vector.extract_strided_slice [[PACKED_4]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_4:%.+]] = vector.shape_cast [[OUT_SLICE_4]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_4]], [[ACC_B]] {offsets = [0, 1, 0], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_5:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [0, 1, 1], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_5:%.+]] = vector.shape_cast [[IN_SLICE_5]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_5:%.+]] = vector.extract [[SCALE_EXT]][0, 1, 1] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_5:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_5]][0], [[SCALE_SCALAR_5]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_5:%.+]] = vector.extract_strided_slice [[PACKED_5]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_5:%.+]] = vector.shape_cast [[OUT_SLICE_5]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_5]], [[ACC_A]] {offsets = [0, 1, 1], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_6:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [0, 1, 2], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_6:%.+]] = vector.shape_cast [[IN_SLICE_6]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_6:%.+]] = vector.extract [[SCALE_EXT]][0, 1, 2] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_6:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_6]][0], [[SCALE_SCALAR_6]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_6:%.+]] = vector.extract_strided_slice [[PACKED_6]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_6:%.+]] = vector.shape_cast [[OUT_SLICE_6]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_6]], [[ACC_B]] {offsets = [0, 1, 2], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_7:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [0, 1, 3], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_7:%.+]] = vector.shape_cast [[IN_SLICE_7]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_7:%.+]] = vector.extract [[SCALE_EXT]][0, 1, 3] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_7:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_7]][0], [[SCALE_SCALAR_7]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_7:%.+]] = vector.extract_strided_slice [[PACKED_7]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_7:%.+]] = vector.shape_cast [[OUT_SLICE_7]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_7]], [[ACC_A]] {offsets = [0, 1, 3], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_8:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [1, 0, 0], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_8:%.+]] = vector.shape_cast [[IN_SLICE_8]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_8:%.+]] = vector.extract [[SCALE_EXT]][1, 0, 0] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_8:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_8]][0], [[SCALE_SCALAR_8]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_8:%.+]] = vector.extract_strided_slice [[PACKED_8]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_8:%.+]] = vector.shape_cast [[OUT_SLICE_8]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_8]], [[ACC_B]] {offsets = [1, 0, 0], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_9:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [1, 0, 1], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_9:%.+]] = vector.shape_cast [[IN_SLICE_9]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_9:%.+]] = vector.extract [[SCALE_EXT]][1, 0, 1] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_9:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_9]][0], [[SCALE_SCALAR_9]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_9:%.+]] = vector.extract_strided_slice [[PACKED_9]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_9:%.+]] = vector.shape_cast [[OUT_SLICE_9]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_9]], [[ACC_A]] {offsets = [1, 0, 1], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_10:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [1, 0, 2], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_10:%.+]] = vector.shape_cast [[IN_SLICE_10]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_10:%.+]] = vector.extract [[SCALE_EXT]][1, 0, 2] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_10:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_10]][0], [[SCALE_SCALAR_10]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_10:%.+]] = vector.extract_strided_slice [[PACKED_10]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_10:%.+]] = vector.shape_cast [[OUT_SLICE_10]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_10]], [[ACC_B]] {offsets = [1, 0, 2], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_11:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [1, 0, 3], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_11:%.+]] = vector.shape_cast [[IN_SLICE_11]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_11:%.+]] = vector.extract [[SCALE_EXT]][1, 0, 3] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_11:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_11]][0], [[SCALE_SCALAR_11]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_11:%.+]] = vector.extract_strided_slice [[PACKED_11]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_11:%.+]] = vector.shape_cast [[OUT_SLICE_11]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_11]], [[ACC_A]] {offsets = [1, 0, 3], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_12:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [1, 1, 0], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_12:%.+]] = vector.shape_cast [[IN_SLICE_12]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_12:%.+]] = vector.extract [[SCALE_EXT]][1, 1, 0] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_12:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_12]][0], [[SCALE_SCALAR_12]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_12:%.+]] = vector.extract_strided_slice [[PACKED_12]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_12:%.+]] = vector.shape_cast [[OUT_SLICE_12]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_12]], [[ACC_B]] {offsets = [1, 1, 0], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_13:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [1, 1, 1], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_13:%.+]] = vector.shape_cast [[IN_SLICE_13]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_13:%.+]] = vector.extract [[SCALE_EXT]][1, 1, 1] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_13:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_13]][0], [[SCALE_SCALAR_13]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_13:%.+]] = vector.extract_strided_slice [[PACKED_13]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_13:%.+]] = vector.shape_cast [[OUT_SLICE_13]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_13]], [[ACC_A]] {offsets = [1, 1, 1], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_14:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [1, 1, 2], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_14:%.+]] = vector.shape_cast [[IN_SLICE_14]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_14:%.+]] = vector.extract [[SCALE_EXT]][1, 1, 2] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_14:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_14]][0], [[SCALE_SCALAR_14]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_14:%.+]] = vector.extract_strided_slice [[PACKED_14]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_14:%.+]] = vector.shape_cast [[OUT_SLICE_14]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_14]], [[ACC_B]] {offsets = [1, 1, 2], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_15:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [1, 1, 3], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_15:%.+]] = vector.shape_cast [[IN_SLICE_15]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_15:%.+]] = vector.extract [[SCALE_EXT]][1, 1, 3] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_15:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_15]][0], [[SCALE_SCALAR_15]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_15:%.+]] = vector.extract_strided_slice [[PACKED_15]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_15:%.+]] = vector.shape_cast [[OUT_SLICE_15]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_15]], [[ACC_A]] {offsets = [1, 1, 3], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_16:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [2, 0, 0], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_16:%.+]] = vector.shape_cast [[IN_SLICE_16]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_16:%.+]] = vector.extract [[SCALE_EXT]][2, 0, 0] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_16:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_16]][0], [[SCALE_SCALAR_16]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_16:%.+]] = vector.extract_strided_slice [[PACKED_16]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_16:%.+]] = vector.shape_cast [[OUT_SLICE_16]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_16]], [[ACC_B]] {offsets = [2, 0, 0], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_17:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [2, 0, 1], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_17:%.+]] = vector.shape_cast [[IN_SLICE_17]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_17:%.+]] = vector.extract [[SCALE_EXT]][2, 0, 1] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_17:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_17]][0], [[SCALE_SCALAR_17]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_17:%.+]] = vector.extract_strided_slice [[PACKED_17]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_17:%.+]] = vector.shape_cast [[OUT_SLICE_17]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_17]], [[ACC_A]] {offsets = [2, 0, 1], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_18:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [2, 0, 2], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_18:%.+]] = vector.shape_cast [[IN_SLICE_18]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_18:%.+]] = vector.extract [[SCALE_EXT]][2, 0, 2] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_18:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_18]][0], [[SCALE_SCALAR_18]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_18:%.+]] = vector.extract_strided_slice [[PACKED_18]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_18:%.+]] = vector.shape_cast [[OUT_SLICE_18]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_18]], [[ACC_B]] {offsets = [2, 0, 2], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_19:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [2, 0, 3], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_19:%.+]] = vector.shape_cast [[IN_SLICE_19]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_19:%.+]] = vector.extract [[SCALE_EXT]][2, 0, 3] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_19:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_19]][0], [[SCALE_SCALAR_19]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_19:%.+]] = vector.extract_strided_slice [[PACKED_19]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_19:%.+]] = vector.shape_cast [[OUT_SLICE_19]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_19]], [[ACC_A]] {offsets = [2, 0, 3], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_20:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [2, 1, 0], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_20:%.+]] = vector.shape_cast [[IN_SLICE_20]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_20:%.+]] = vector.extract [[SCALE_EXT]][2, 1, 0] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_20:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_20]][0], [[SCALE_SCALAR_20]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_20:%.+]] = vector.extract_strided_slice [[PACKED_20]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_20:%.+]] = vector.shape_cast [[OUT_SLICE_20]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_20]], [[ACC_B]] {offsets = [2, 1, 0], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_21:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [2, 1, 1], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_21:%.+]] = vector.shape_cast [[IN_SLICE_21]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_21:%.+]] = vector.extract [[SCALE_EXT]][2, 1, 1] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_21:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_21]][0], [[SCALE_SCALAR_21]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_21:%.+]] = vector.extract_strided_slice [[PACKED_21]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_21:%.+]] = vector.shape_cast [[OUT_SLICE_21]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_21]], [[ACC_A]] {offsets = [2, 1, 1], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_22:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [2, 1, 2], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_22:%.+]] = vector.shape_cast [[IN_SLICE_22]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_22:%.+]] = vector.extract [[SCALE_EXT]][2, 1, 2] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_22:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_22]][0], [[SCALE_SCALAR_22]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_22:%.+]] = vector.extract_strided_slice [[PACKED_22]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_22:%.+]] = vector.shape_cast [[OUT_SLICE_22]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_22]], [[ACC_B]] {offsets = [2, 1, 2], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_23:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [2, 1, 3], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_23:%.+]] = vector.shape_cast [[IN_SLICE_23]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_23:%.+]] = vector.extract [[SCALE_EXT]][2, 1, 3] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_23:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_23]][0], [[SCALE_SCALAR_23]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_23:%.+]] = vector.extract_strided_slice [[PACKED_23]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_23:%.+]] = vector.shape_cast [[OUT_SLICE_23]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_23]], [[ACC_A]] {offsets = [2, 1, 3], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_24:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [3, 0, 0], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_24:%.+]] = vector.shape_cast [[IN_SLICE_24]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_24:%.+]] = vector.extract [[SCALE_EXT]][3, 0, 0] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_24:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_24]][0], [[SCALE_SCALAR_24]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_24:%.+]] = vector.extract_strided_slice [[PACKED_24]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_24:%.+]] = vector.shape_cast [[OUT_SLICE_24]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_24]], [[ACC_B]] {offsets = [3, 0, 0], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_25:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [3, 0, 1], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_25:%.+]] = vector.shape_cast [[IN_SLICE_25]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_25:%.+]] = vector.extract [[SCALE_EXT]][3, 0, 1] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_25:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_25]][0], [[SCALE_SCALAR_25]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_25:%.+]] = vector.extract_strided_slice [[PACKED_25]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_25:%.+]] = vector.shape_cast [[OUT_SLICE_25]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_25]], [[ACC_A]] {offsets = [3, 0, 1], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_26:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [3, 0, 2], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_26:%.+]] = vector.shape_cast [[IN_SLICE_26]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_26:%.+]] = vector.extract [[SCALE_EXT]][3, 0, 2] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_26:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_26]][0], [[SCALE_SCALAR_26]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_26:%.+]] = vector.extract_strided_slice [[PACKED_26]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_26:%.+]] = vector.shape_cast [[OUT_SLICE_26]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_26]], [[ACC_B]] {offsets = [3, 0, 2], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_27:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [3, 0, 3], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_27:%.+]] = vector.shape_cast [[IN_SLICE_27]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_27:%.+]] = vector.extract [[SCALE_EXT]][3, 0, 3] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_27:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_27]][0], [[SCALE_SCALAR_27]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_27:%.+]] = vector.extract_strided_slice [[PACKED_27]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_27:%.+]] = vector.shape_cast [[OUT_SLICE_27]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_27]], [[ACC_A]] {offsets = [3, 0, 3], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_28:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [3, 1, 0], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_28:%.+]] = vector.shape_cast [[IN_SLICE_28]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_28:%.+]] = vector.extract [[SCALE_EXT]][3, 1, 0] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_28:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_28]][0], [[SCALE_SCALAR_28]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_28:%.+]] = vector.extract_strided_slice [[PACKED_28]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_28:%.+]] = vector.shape_cast [[OUT_SLICE_28]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_28]], [[ACC_B]] {offsets = [3, 1, 0], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_29:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [3, 1, 1], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_29:%.+]] = vector.shape_cast [[IN_SLICE_29]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_29:%.+]] = vector.extract [[SCALE_EXT]][3, 1, 1] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_29:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_29]][0], [[SCALE_SCALAR_29]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_29:%.+]] = vector.extract_strided_slice [[PACKED_29]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_29:%.+]] = vector.shape_cast [[OUT_SLICE_29]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_29]], [[ACC_A]] {offsets = [3, 1, 1], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_30:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [3, 1, 2], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_30:%.+]] = vector.shape_cast [[IN_SLICE_30]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_30:%.+]] = vector.extract [[SCALE_EXT]][3, 1, 2] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_30:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_30]][0], [[SCALE_SCALAR_30]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_30:%.+]] = vector.extract_strided_slice [[PACKED_30]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_30:%.+]] = vector.shape_cast [[OUT_SLICE_30]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_30]], [[ACC_B]] {offsets = [3, 1, 2], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_31:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [3, 1, 3], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_31:%.+]] = vector.shape_cast [[IN_SLICE_31]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_31:%.+]] = vector.extract [[SCALE_EXT]][3, 1, 3] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_31:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_31]][0], [[SCALE_SCALAR_31]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_31:%.+]] = vector.extract_strided_slice [[PACKED_31]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_31:%.+]] = vector.shape_cast [[OUT_SLICE_31]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_31]], [[ACC_A]] {offsets = [3, 1, 3], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_32:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [4, 0, 0], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_32:%.+]] = vector.shape_cast [[IN_SLICE_32]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_32:%.+]] = vector.extract [[SCALE_EXT]][4, 0, 0] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_32:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_32]][0], [[SCALE_SCALAR_32]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_32:%.+]] = vector.extract_strided_slice [[PACKED_32]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_32:%.+]] = vector.shape_cast [[OUT_SLICE_32]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_32]], [[ACC_B]] {offsets = [4, 0, 0], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_33:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [4, 0, 1], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_33:%.+]] = vector.shape_cast [[IN_SLICE_33]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_33:%.+]] = vector.extract [[SCALE_EXT]][4, 0, 1] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_33:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_33]][0], [[SCALE_SCALAR_33]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_33:%.+]] = vector.extract_strided_slice [[PACKED_33]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_33:%.+]] = vector.shape_cast [[OUT_SLICE_33]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_33]], [[ACC_A]] {offsets = [4, 0, 1], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_34:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [4, 0, 2], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_34:%.+]] = vector.shape_cast [[IN_SLICE_34]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_34:%.+]] = vector.extract [[SCALE_EXT]][4, 0, 2] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_34:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_34]][0], [[SCALE_SCALAR_34]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_34:%.+]] = vector.extract_strided_slice [[PACKED_34]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_34:%.+]] = vector.shape_cast [[OUT_SLICE_34]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_34]], [[ACC_B]] {offsets = [4, 0, 2], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_35:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [4, 0, 3], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_35:%.+]] = vector.shape_cast [[IN_SLICE_35]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_35:%.+]] = vector.extract [[SCALE_EXT]][4, 0, 3] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_35:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_35]][0], [[SCALE_SCALAR_35]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_35:%.+]] = vector.extract_strided_slice [[PACKED_35]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_35:%.+]] = vector.shape_cast [[OUT_SLICE_35]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_35]], [[ACC_A]] {offsets = [4, 0, 3], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_36:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [4, 1, 0], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_36:%.+]] = vector.shape_cast [[IN_SLICE_36]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_36:%.+]] = vector.extract [[SCALE_EXT]][4, 1, 0] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_36:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_36]][0], [[SCALE_SCALAR_36]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_36:%.+]] = vector.extract_strided_slice [[PACKED_36]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_36:%.+]] = vector.shape_cast [[OUT_SLICE_36]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_36]], [[ACC_B]] {offsets = [4, 1, 0], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_37:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [4, 1, 1], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_37:%.+]] = vector.shape_cast [[IN_SLICE_37]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_37:%.+]] = vector.extract [[SCALE_EXT]][4, 1, 1] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_37:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_37]][0], [[SCALE_SCALAR_37]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_37:%.+]] = vector.extract_strided_slice [[PACKED_37]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_37:%.+]] = vector.shape_cast [[OUT_SLICE_37]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_37]], [[ACC_A]] {offsets = [4, 1, 1], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_38:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [4, 1, 2], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_38:%.+]] = vector.shape_cast [[IN_SLICE_38]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_38:%.+]] = vector.extract [[SCALE_EXT]][4, 1, 2] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_38:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_38]][0], [[SCALE_SCALAR_38]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_38:%.+]] = vector.extract_strided_slice [[PACKED_38]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_38:%.+]] = vector.shape_cast [[OUT_SLICE_38]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_38]], [[ACC_B]] {offsets = [4, 1, 2], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_39:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [4, 1, 3], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_39:%.+]] = vector.shape_cast [[IN_SLICE_39]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_39:%.+]] = vector.extract [[SCALE_EXT]][4, 1, 3] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_39:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_39]][0], [[SCALE_SCALAR_39]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_39:%.+]] = vector.extract_strided_slice [[PACKED_39]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_39:%.+]] = vector.shape_cast [[OUT_SLICE_39]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_39]], [[ACC_A]] {offsets = [4, 1, 3], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_40:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [5, 0, 0], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_40:%.+]] = vector.shape_cast [[IN_SLICE_40]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_40:%.+]] = vector.extract [[SCALE_EXT]][5, 0, 0] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_40:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_40]][0], [[SCALE_SCALAR_40]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_40:%.+]] = vector.extract_strided_slice [[PACKED_40]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_40:%.+]] = vector.shape_cast [[OUT_SLICE_40]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_40]], [[ACC_B]] {offsets = [5, 0, 0], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_41:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [5, 0, 1], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_41:%.+]] = vector.shape_cast [[IN_SLICE_41]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_41:%.+]] = vector.extract [[SCALE_EXT]][5, 0, 1] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_41:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_41]][0], [[SCALE_SCALAR_41]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_41:%.+]] = vector.extract_strided_slice [[PACKED_41]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_41:%.+]] = vector.shape_cast [[OUT_SLICE_41]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_41]], [[ACC_A]] {offsets = [5, 0, 1], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_42:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [5, 0, 2], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_42:%.+]] = vector.shape_cast [[IN_SLICE_42]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_42:%.+]] = vector.extract [[SCALE_EXT]][5, 0, 2] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_42:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_42]][0], [[SCALE_SCALAR_42]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_42:%.+]] = vector.extract_strided_slice [[PACKED_42]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_42:%.+]] = vector.shape_cast [[OUT_SLICE_42]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_42]], [[ACC_B]] {offsets = [5, 0, 2], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_43:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [5, 0, 3], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_43:%.+]] = vector.shape_cast [[IN_SLICE_43]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_43:%.+]] = vector.extract [[SCALE_EXT]][5, 0, 3] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_43:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_43]][0], [[SCALE_SCALAR_43]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_43:%.+]] = vector.extract_strided_slice [[PACKED_43]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_43:%.+]] = vector.shape_cast [[OUT_SLICE_43]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_43]], [[ACC_A]] {offsets = [5, 0, 3], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_44:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [5, 1, 0], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_44:%.+]] = vector.shape_cast [[IN_SLICE_44]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_44:%.+]] = vector.extract [[SCALE_EXT]][5, 1, 0] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_44:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_44]][0], [[SCALE_SCALAR_44]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_44:%.+]] = vector.extract_strided_slice [[PACKED_44]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_44:%.+]] = vector.shape_cast [[OUT_SLICE_44]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_44]], [[ACC_B]] {offsets = [5, 1, 0], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_45:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [5, 1, 1], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_45:%.+]] = vector.shape_cast [[IN_SLICE_45]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_45:%.+]] = vector.extract [[SCALE_EXT]][5, 1, 1] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_45:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_45]][0], [[SCALE_SCALAR_45]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_45:%.+]] = vector.extract_strided_slice [[PACKED_45]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_45:%.+]] = vector.shape_cast [[OUT_SLICE_45]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_45]], [[ACC_A]] {offsets = [5, 1, 1], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_46:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [5, 1, 2], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_46:%.+]] = vector.shape_cast [[IN_SLICE_46]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_46:%.+]] = vector.extract [[SCALE_EXT]][5, 1, 2] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_46:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_46]][0], [[SCALE_SCALAR_46]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_46:%.+]] = vector.extract_strided_slice [[PACKED_46]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_46:%.+]] = vector.shape_cast [[OUT_SLICE_46]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_46]], [[ACC_B]] {offsets = [5, 1, 2], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_47:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [5, 1, 3], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_47:%.+]] = vector.shape_cast [[IN_SLICE_47]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_47:%.+]] = vector.extract [[SCALE_EXT]][5, 1, 3] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_47:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_47]][0], [[SCALE_SCALAR_47]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_47:%.+]] = vector.extract_strided_slice [[PACKED_47]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_47:%.+]] = vector.shape_cast [[OUT_SLICE_47]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_47]], [[ACC_A]] {offsets = [5, 1, 3], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_48:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [6, 0, 0], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_48:%.+]] = vector.shape_cast [[IN_SLICE_48]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_48:%.+]] = vector.extract [[SCALE_EXT]][6, 0, 0] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_48:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_48]][0], [[SCALE_SCALAR_48]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_48:%.+]] = vector.extract_strided_slice [[PACKED_48]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_48:%.+]] = vector.shape_cast [[OUT_SLICE_48]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_48]], [[ACC_B]] {offsets = [6, 0, 0], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_49:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [6, 0, 1], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_49:%.+]] = vector.shape_cast [[IN_SLICE_49]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_49:%.+]] = vector.extract [[SCALE_EXT]][6, 0, 1] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_49:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_49]][0], [[SCALE_SCALAR_49]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_49:%.+]] = vector.extract_strided_slice [[PACKED_49]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_49:%.+]] = vector.shape_cast [[OUT_SLICE_49]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_49]], [[ACC_A]] {offsets = [6, 0, 1], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_50:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [6, 0, 2], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_50:%.+]] = vector.shape_cast [[IN_SLICE_50]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_50:%.+]] = vector.extract [[SCALE_EXT]][6, 0, 2] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_50:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_50]][0], [[SCALE_SCALAR_50]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_50:%.+]] = vector.extract_strided_slice [[PACKED_50]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_50:%.+]] = vector.shape_cast [[OUT_SLICE_50]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_50]], [[ACC_B]] {offsets = [6, 0, 2], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_51:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [6, 0, 3], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_51:%.+]] = vector.shape_cast [[IN_SLICE_51]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_51:%.+]] = vector.extract [[SCALE_EXT]][6, 0, 3] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_51:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_51]][0], [[SCALE_SCALAR_51]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_51:%.+]] = vector.extract_strided_slice [[PACKED_51]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_51:%.+]] = vector.shape_cast [[OUT_SLICE_51]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_51]], [[ACC_A]] {offsets = [6, 0, 3], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_52:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [6, 1, 0], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_52:%.+]] = vector.shape_cast [[IN_SLICE_52]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_52:%.+]] = vector.extract [[SCALE_EXT]][6, 1, 0] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_52:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_52]][0], [[SCALE_SCALAR_52]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_52:%.+]] = vector.extract_strided_slice [[PACKED_52]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_52:%.+]] = vector.shape_cast [[OUT_SLICE_52]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_52]], [[ACC_B]] {offsets = [6, 1, 0], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_53:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [6, 1, 1], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_53:%.+]] = vector.shape_cast [[IN_SLICE_53]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_53:%.+]] = vector.extract [[SCALE_EXT]][6, 1, 1] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_53:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_53]][0], [[SCALE_SCALAR_53]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_53:%.+]] = vector.extract_strided_slice [[PACKED_53]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_53:%.+]] = vector.shape_cast [[OUT_SLICE_53]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_53]], [[ACC_A]] {offsets = [6, 1, 1], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_54:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [6, 1, 2], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_54:%.+]] = vector.shape_cast [[IN_SLICE_54]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_54:%.+]] = vector.extract [[SCALE_EXT]][6, 1, 2] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_54:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_54]][0], [[SCALE_SCALAR_54]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_54:%.+]] = vector.extract_strided_slice [[PACKED_54]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_54:%.+]] = vector.shape_cast [[OUT_SLICE_54]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_54]], [[ACC_B]] {offsets = [6, 1, 2], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_55:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [6, 1, 3], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_55:%.+]] = vector.shape_cast [[IN_SLICE_55]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_55:%.+]] = vector.extract [[SCALE_EXT]][6, 1, 3] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_55:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_55]][0], [[SCALE_SCALAR_55]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_55:%.+]] = vector.extract_strided_slice [[PACKED_55]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_55:%.+]] = vector.shape_cast [[OUT_SLICE_55]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_55]], [[ACC_A]] {offsets = [6, 1, 3], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_56:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [7, 0, 0], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_56:%.+]] = vector.shape_cast [[IN_SLICE_56]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_56:%.+]] = vector.extract [[SCALE_EXT]][7, 0, 0] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_56:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_56]][0], [[SCALE_SCALAR_56]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_56:%.+]] = vector.extract_strided_slice [[PACKED_56]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_56:%.+]] = vector.shape_cast [[OUT_SLICE_56]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_56]], [[ACC_B]] {offsets = [7, 0, 0], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_57:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [7, 0, 1], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_57:%.+]] = vector.shape_cast [[IN_SLICE_57]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_57:%.+]] = vector.extract [[SCALE_EXT]][7, 0, 1] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_57:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_57]][0], [[SCALE_SCALAR_57]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_57:%.+]] = vector.extract_strided_slice [[PACKED_57]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_57:%.+]] = vector.shape_cast [[OUT_SLICE_57]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_57]], [[ACC_A]] {offsets = [7, 0, 1], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_58:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [7, 0, 2], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_58:%.+]] = vector.shape_cast [[IN_SLICE_58]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_58:%.+]] = vector.extract [[SCALE_EXT]][7, 0, 2] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_58:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_58]][0], [[SCALE_SCALAR_58]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_58:%.+]] = vector.extract_strided_slice [[PACKED_58]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_58:%.+]] = vector.shape_cast [[OUT_SLICE_58]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_58]], [[ACC_B]] {offsets = [7, 0, 2], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_59:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [7, 0, 3], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_59:%.+]] = vector.shape_cast [[IN_SLICE_59]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_59:%.+]] = vector.extract [[SCALE_EXT]][7, 0, 3] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_59:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_59]][0], [[SCALE_SCALAR_59]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_59:%.+]] = vector.extract_strided_slice [[PACKED_59]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_59:%.+]] = vector.shape_cast [[OUT_SLICE_59]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_59]], [[ACC_A]] {offsets = [7, 0, 3], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_60:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [7, 1, 0], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_60:%.+]] = vector.shape_cast [[IN_SLICE_60]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_60:%.+]] = vector.extract [[SCALE_EXT]][7, 1, 0] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_60:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_60]][0], [[SCALE_SCALAR_60]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_60:%.+]] = vector.extract_strided_slice [[PACKED_60]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_60:%.+]] = vector.shape_cast [[OUT_SLICE_60]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_60]], [[ACC_B]] {offsets = [7, 1, 0], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_61:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [7, 1, 1], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_61:%.+]] = vector.shape_cast [[IN_SLICE_61]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_61:%.+]] = vector.extract [[SCALE_EXT]][7, 1, 1] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_61:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_61]][0], [[SCALE_SCALAR_61]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_61:%.+]] = vector.extract_strided_slice [[PACKED_61]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_61:%.+]] = vector.shape_cast [[OUT_SLICE_61]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_61]], [[ACC_A]] {offsets = [7, 1, 1], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_62:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [7, 1, 2], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_62:%.+]] = vector.shape_cast [[IN_SLICE_62]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_62:%.+]] = vector.extract [[SCALE_EXT]][7, 1, 2] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_62:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_62]][0], [[SCALE_SCALAR_62]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_62:%.+]] = vector.extract_strided_slice [[PACKED_62]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_62:%.+]] = vector.shape_cast [[OUT_SLICE_62]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_62]], [[ACC_B]] {offsets = [7, 1, 2], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[IN_SLICE_63:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [7, 1, 3], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
+// CHECK-NEXT:    [[IN_VEC_63:%.+]] = vector.shape_cast [[IN_SLICE_63]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    [[SCALE_SCALAR_63:%.+]] = vector.extract [[SCALE_EXT]][7, 1, 3] : f32 from vector<8x2x4xf32>
+// CHECK-NEXT:    [[PACKED_63:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_63]][0], [[SCALE_SCALAR_63]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[OUT_SLICE_63:%.+]] = vector.extract_strided_slice [[PACKED_63]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    [[OUT_VEC_63:%.+]] = vector.shape_cast [[OUT_SLICE_63]] : vector<1xf32> to vector<1x1x1xf32>
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_63]], [[ACC_A]] {offsets = [7, 1, 3], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
+// CHECK-NEXT:    [[FINAL_CAST:%.+]] = vector.shape_cast [[ACC_B]] : vector<8x2x4xf32> to vector<8x8xf32>
+// CHECK-NEXT:    return [[FINAL_CAST]] : vector<8x8xf32>
+// Broadcast case: each of the 8x2 per-block scales is broadcast over a
+// trailing block dimension of 4, then the 8x2x4 scaled extension is fully
+// unrolled (see the CHECK lines above) into 64 one-element
+// amdgpu.scaled_ext_packed ops, one per (i, j, k) index, each paired with
+// the matching extended scale element.
+func.func @conversion_broadcast(%in: vector<8x8xf8E5M2>, %scale: vector<8x2xf8E8M0FNU>) -> vector<8x8xf32> {
+    // Replicate the scales so every input element has a scale of its own.
+    %bc = vector.broadcast %scale : vector<8x2xf8E8M0FNU> to vector<4x8x2xf8E8M0FNU>
+    // Reshape input and scales to the same 8x2x4 layout before scaling.
+    %cast1 = vector.shape_cast %in : vector<8x8xf8E5M2> to vector<8x2x4xf8E5M2>
+    %cast2 = vector.shape_cast %bc : vector<4x8x2xf8E8M0FNU> to vector<8x2x4xf8E8M0FNU>
+    %ext = arith.scaling_extf %cast1, %cast2 : vector<8x2x4xf8E5M2>, vector<8x2x4xf8E8M0FNU> to vector<8x2x4xf32>
+    // Restore the original 8x8 shape for the result.
+    %cast3 = vector.shape_cast %ext : vector<8x2x4xf32> to vector<8x8xf32>
+    return %cast3 : vector<8x8xf32>
+}
+
+// -----
+
+// Scalar case: the f8 scale is extended to f32, the f8 input is splatted
+// into a 1-element vector so the packed hardware op can be used, and
+// element 0 of the 2-wide packed result is extracted as the scalar f32.
+// CHECK-LABEL: @conversion_scalar
+// CHECK:         [[SCALE_F32:%.+]] = arith.extf %arg1 : f8E8M0FNU to f32
+// CHECK-NEXT:    [[SPLAT_IN:%.+]] = vector.splat %arg0 : vector<1xf8E5M2>
+// CHECK-NEXT:    [[PACKED_EXT:%.+]] = amdgpu.scaled_ext_packed [[SPLAT_IN]][0], [[SCALE_F32]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    [[RESULT:%.+]] = vector.extract [[PACKED_EXT]][0] : f32 from vector<2xf32>
+// CHECK-NEXT:    return [[RESULT]] : f32
+func.func @conversion_scalar(%in: f8E5M2, %scale: f8E8M0FNU) -> f32 {
+    // Scalar scaling_extf lowers through the packed vector op (see CHECKs).
+    %ext = arith.scaling_extf %in, %scale : f8E5M2, f8E8M0FNU to f32
+    return %ext : f32
+}
diff --git a/mlir/test/Conversion/ArithToAMDGPU/scaling-truncf.mlir b/mlir/test/Conversion/ArithToAMDGPU/scaling-truncf.mlir
new file mode 100644
index 0000000000000..2dcb0c554cc6f
--- /dev/null
+++ b/mlir/test/Conversion/ArithToAMDGPU/scaling-truncf.mlir
@@ -0,0 +1,557 @@
+// RUN: mlir-opt --split-input-file %s -convert-arith-to-amdgpu="chipset=gfx950" | FileCheck %s
+
+// CHECK-LABEL: @conversion_f8_fallback
+// CHECK-DAG:     [[CST:%.+]] = arith.constant dense<0.000000e+00> : vector<2x2xf8E5M2>
+// CHECK-DAG:     [[SCALE_EXT:%.+]] = arith.extf %arg1 : vector<2x2xf8E8M0FNU> to vector<2x2xf32>
+// CHECK:         [[IN_SLICE_00:%.+]] = vector.extract_strided_slice %arg0 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]}
+// CHECK-NEXT:    [[IN_SCALAR_00:%.+]] = vector.shape_cast [[IN_SLICE_00]]
+// CHECK-NEXT:    [[SCALE_SCALAR_00:%.+]] = vector.extract [[SCALE_EXT]][0, 0]
+// CHECK-NEXT:    [[PACKED_00:%.+]] = amdgpu.packed_scaled_trunc [[IN_SCALAR_00]] into undef[0], [[SCALE_SCALAR_00]]
+// CHECK-NEXT:    [[OUT_SLICE_00:%.+]] = vector.extract_strided_slice [[PACKED_00]]
+// CHECK-NEXT:    [[OUT_SCALAR_00:%.+]] = vector.shape_cast [[OUT_SLICE_00]]
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_SCALAR_00]], [[CST]]
+// CHECK-NEXT:    [[IN_SLICE_01:%.+]] = vector.extract_strided_slice %arg0 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]}
+// CHECK-NEXT:    [[IN_SCALAR_01:%.+]] = vector.shape_cast [[IN_SLICE_01]]
+// CHECK-NEXT:    [[SCALE_SCALAR_01:%.+]] = vector.extract [[SCALE_EXT]][0, 1]
+// CHECK-NEXT:    [[PACKED_01:%.+]] = amdgpu.packed_scaled_trunc [[IN_SCALAR_01]] into undef[0], [[SCALE_SCALAR_01]]
+// CHECK-NEXT:    [[OUT_SLICE_01:%.+]] = vector.extract_strided_slice [[PACKED_01]]
+// CHECK-NEXT:    [[OUT_SCALAR_01:%.+]] = vector.shape_cast [[OUT_SLICE_01]]
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_SCALAR_01]], [[ACC_A]]
+// CHECK-NEXT:    [[IN_SLICE_10:%.+]] = vector.extract_strided_slice %arg0 {offsets = [1, 0], sizes = [1, 1], strides = [1, 1]}
+// CHECK-NEXT:    [[IN_SCALAR_10:%.+]] = vector.shape_cast [[IN_SLICE_10]]
+// CHECK-NEXT:    [[SCALE_SCALAR_10:%.+]] = vector.extract [[SCALE_EXT]][1, 0]
+// CHECK-NEXT:    [[PACKED_10:%.+]] = amdgpu.packed_scaled_trunc [[IN_SCALAR_10]] into undef[0], [[SCALE_SCALAR_10]]
+// CHECK-NEXT:    [[OUT_SLICE_10:%.+]] = vector.extract_strided_slice [[PACKED_10]]
+// CHECK-NEXT:    [[OUT_SCALAR_10:%.+]] = vector.shape_cast [[OUT_SLICE_10]]
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_SCALAR_10]], [[ACC_B]]
+// CHECK-NEXT:    [[IN_SLICE_11:%.+]] = vector.extract_strided_slice %arg0 {offsets = [1, 1], sizes = [1, 1], strides = [1, 1]}
+// CHECK-NEXT:    [[IN_SCALAR_11:%.+]] = vector.shape_cast [[IN_SLICE_11]]
+// CHECK-NEXT:    [[SCALE_SCALAR_11:%.+]] = vector.extract [[SCALE_EXT]][1, 1]
+// CHECK-NEXT:    [[PACKED_11:%.+]] = amdgpu.packed_scaled_trunc [[IN_SCALAR_11]] into undef[0], [[SCALE_SCALAR_11]]
+// CHECK-NEXT:    [[OUT_SLICE_11:%.+]] = vector.extract_strided_slice [[PACKED_11]]
+// CHECK-NEXT:    [[OUT_SCALAR_11:%.+]] = vector.shape_cast [[OUT_SLICE_11]]
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_SCALAR_11]], [[ACC_A]]
+// CHECK-NEXT:    return [[ACC_B]] : vector<2x2xf8E5M2>
+func.func @conversion_f8_fallback(%in: vector<2x2xf32>, %scale: vector<2x2xf8E8M0FNU>) -> vector<2x2xf8E5M2> {
+    %ext = arith.scaling_truncf %in, %scale : vector<2x2xf32>, vector<2x2xf8E8M0FNU> to vector<2x2xf8E5M2>
+    return %ext : vector<2x2xf8E5M2>
+}
+
+// -----
+
+// CHECK-LABEL: @conversion_f4_fallback
+// CHECK-DAG:     [[CST:%.+]] = arith.constant dense<0.000000e+00> : vector<2x2xf4E2M1FN>
+// CHECK-DAG:     [[SCALE_EXT:%.+]] = arith.extf %arg1 : vector<2x2xf8E8M0FNU> to vector<2x2xf32>
+// CHECK:         [[IN_SLICE_00:%.+]] = vector.extract_strided_slice %arg0 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]}
+// CHECK-NEXT:    [[IN_SCALAR_00:%.+]] = vector.shape_cast [[IN_SLICE_00]]
+// CHECK-NEXT:    [[SCALE_SCALAR_00:%.+]] = vector.extract [[SCALE_EXT]][0, 0]
+// CHECK-NEXT:    [[PACKED_00:%.+]] = amdgpu.packed_scaled_trunc [[IN_SCALAR_00]] into undef[0], [[SCALE_SCALAR_00]]
+// CHECK-NEXT:    [[OUT_SLICE_00:%.+]] = vector.extract_strided_slice [[PACKED_00]]
+// CHECK-NEXT:    [[OUT_SCALAR_00:%.+]] = vector.shape_cast [[OUT_SLICE_00]]
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_SCALAR_00]], [[CST]]
+// CHECK-NEXT:    [[IN_SLICE_01:%.+]] = vector.extract_strided_slice %arg0 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]}
+// CHECK-NEXT:    [[IN_SCALAR_01:%.+]] = vector.shape_cast [[IN_SLICE_01]]
+// CHECK-NEXT:    [[SCALE_SCALAR_01:%.+]] = vector.extract [[SCALE_EXT]][0, 1]
+// CHECK-NEXT:    [[PACKED_01:%.+]] = amdgpu.packed_scaled_trunc [[IN_SCALAR_01]] into undef[0], [[SCALE_SCALAR_01]]
+// CHECK-NEXT:    [[OUT_SLICE_01:%.+]] = vector.extract_strided_slice [[PACKED_01]]
+// CHECK-NEXT:    [[OUT_SCALAR_01:%.+]] = vector.shape_cast [[OUT_SLICE_01]]
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_SCALAR_01]], [[ACC_A]]
+// CHECK-NEXT:    [[IN_SLICE_10:%.+]] = vector.extract_strided_slice %arg0 {offsets = [1, 0], sizes = [1, 1], strides = [1, 1]}
+// CHECK-NEXT:    [[IN_SCALAR_10:%.+]] = vector.shape_cast [[IN_SLICE_10]]
+// CHECK-NEXT:    [[SCALE_SCALAR_10:%.+]] = vector.extract [[SCALE_EXT]][1, 0]
+// CHECK-NEXT:    [[PACKED_10:%.+]] = amdgpu.packed_scaled_trunc [[IN_SCALAR_10]] into undef[0], [[SCALE_SCALAR_10]]
+// CHECK-NEXT:    [[OUT_SLICE_10:%.+]] = vector.extract_strided_slice [[PACKED_10]]
+// CHECK-NEXT:    [[OUT_SCALAR_10:%.+]] = vector.shape_cast [[OUT_SLICE_10]]
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_SCALAR_10]], [[ACC_B]]
+// CHECK-NEXT:    [[IN_SLICE_11:%.+]] = vector.extract_strided_slice %arg0 {offsets = [1, 1], sizes = [1, 1], strides = [1, 1]}
+// CHECK-NEXT:    [[IN_SCALAR_11:%.+]] = vector.shape_cast [[IN_SLICE_11]]
+// CHECK-NEXT:    [[SCALE_SCALAR_11:%.+]] = vector.extract [[SCALE_EXT]][1, 1]
+// CHECK-NEXT:    [[PACKED_11:%.+]] = amdgpu.packed_scaled_trunc [[IN_SCALAR_11]] into undef[0], [[SCALE_SCALAR_11]]
+// CHECK-NEXT:    [[OUT_SLICE_11:%.+]] = vector.extract_strided_slice [[PACKED_11]]
+// CHECK-NEXT:    [[OUT_SCALAR_11:%.+]] = vector.shape_cast [[OUT_SLICE_11]]
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_SCALAR_11]], [[ACC_A]]
+// CHECK-NEXT:    return [[ACC_B]] : vector<2x2xf4E2M1FN>
+func.func @conversion_f4_fallback(%in: vector<2x2xf32>, %scale: vector<2x2xf8E8M0FNU>) -> vector<2x2xf4E2M1FN> {
+    %ext = arith.scaling_truncf %in, %scale : vector<2x2xf32>, vector<2x2xf8E8M0FNU> to vector<2x2xf4E2M1FN>
+    return %ext : vector<2x2xf4E2M1FN>
+}
+
+// -----
+
+// CHECK-LABEL: @conversion_broadcast
+// CHECK-DAG:     [[CST:%.+]] = arith.constant dense<0.000000e+00> : vector<8x2x4xf8E5M2>
+// CHECK-DAG:     [[BCAST:%.+]] = vector.broadcast %arg1
+// CHECK-DAG:     [[IN_CAST:%.+]] = vector.shape_cast %arg0 : vector<8x8xf32> to vector<8x2x4xf32>
+// CHECK-DAG:     [[SCALE_CAST:%.+]] = vector.shape_cast [[BCAST]]
+// CHECK-DAG:     [[SCALE_EXT:%.+]] = arith.extf [[SCALE_CAST]] : vector<8x2x4xf8E8M0FNU> to vector<8x2x4xf32>
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [0, 0, 0]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][0, 0, 0]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[CST]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [0, 0, 1]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][0, 0, 1]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [0, 0, 2]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][0, 0, 2]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [0, 0, 3]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][0, 0, 3]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [0, 1, 0]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][0, 1, 0]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [0, 1, 1]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][0, 1, 1]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [0, 1, 2]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][0, 1, 2]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [0, 1, 3]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][0, 1, 3]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [1, 0, 0]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][1, 0, 0]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [1, 0, 1]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][1, 0, 1]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [1, 0, 2]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][1, 0, 2]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [1, 0, 3]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][1, 0, 3]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [1, 1, 0]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][1, 1, 0]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [1, 1, 1]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][1, 1, 1]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [1, 1, 2]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][1, 1, 2]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [1, 1, 3]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][1, 1, 3]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [2, 0, 0]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][2, 0, 0]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [2, 0, 1]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][2, 0, 1]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [2, 0, 2]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][2, 0, 2]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [2, 0, 3]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][2, 0, 3]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [2, 1, 0]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][2, 1, 0]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [2, 1, 1]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][2, 1, 1]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [2, 1, 2]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][2, 1, 2]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [2, 1, 3]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][2, 1, 3]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [3, 0, 0]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][3, 0, 0]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [3, 0, 1]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][3, 0, 1]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [3, 0, 2]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][3, 0, 2]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [3, 0, 3]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][3, 0, 3]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [3, 1, 0]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][3, 1, 0]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [3, 1, 1]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][3, 1, 1]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [3, 1, 2]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][3, 1, 2]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [3, 1, 3]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][3, 1, 3]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [4, 0, 0]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][4, 0, 0]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [4, 0, 1]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][4, 0, 1]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [4, 0, 2]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][4, 0, 2]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [4, 0, 3]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][4, 0, 3]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [4, 1, 0]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][4, 1, 0]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [4, 1, 1]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][4, 1, 1]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [4, 1, 2]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][4, 1, 2]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [4, 1, 3]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][4, 1, 3]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [5, 0, 0]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][5, 0, 0]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [5, 0, 1]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][5, 0, 1]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [5, 0, 2]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][5, 0, 2]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [5, 0, 3]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][5, 0, 3]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [5, 1, 0]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][5, 1, 0]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [5, 1, 1]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][5, 1, 1]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [5, 1, 2]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][5, 1, 2]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [5, 1, 3]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][5, 1, 3]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [6, 0, 0]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][6, 0, 0]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [6, 0, 1]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][6, 0, 1]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [6, 0, 2]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][6, 0, 2]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [6, 0, 3]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][6, 0, 3]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [6, 1, 0]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][6, 1, 0]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [6, 1, 1]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][6, 1, 1]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [6, 1, 2]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][6, 1, 2]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [6, 1, 3]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][6, 1, 3]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [7, 0, 0]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][7, 0, 0]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [7, 0, 1]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][7, 0, 1]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [7, 0, 2]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][7, 0, 2]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [7, 0, 3]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][7, 0, 3]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [7, 1, 0]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][7, 1, 0]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [7, 1, 1]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][7, 1, 1]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [7, 1, 2]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][7, 1, 2]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
+// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [7, 1, 3]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract [[SCALE_EXT]][7, 1, 3]
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
+// CHECK-NEXT:    [[FINAL_CAST:%.+]] = vector.shape_cast [[ACC_B]]
+// CHECK-NEXT:    return [[FINAL_CAST]] : vector<8x8xf8E5M2>
+func.func @conversion_broadcast(%in: vector<8x8xf32>, %scale: vector<8x2xf8E8M0FNU>) -> vector<8x8xf8E5M2> {
+    %bc = vector.broadcast %scale : vector<8x2xf8E8M0FNU> to vector<4x8x2xf8E8M0FNU>
+    %cast1 = vector.shape_cast %in : vector<8x8xf32> to vector<8x2x4xf32>
+    %cast2 = vector.shape_cast %bc : vector<4x8x2xf8E8M0FNU> to vector<8x2x4xf8E8M0FNU>
+    %ext = arith.scaling_truncf %cast1, %cast2 : vector<8x2x4xf32>, vector<8x2x4xf8E8M0FNU> to vector<8x2x4xf8E5M2>
+    %cast3 = vector.shape_cast %ext : vector<8x2x4xf8E5M2> to vector<8x8xf8E5M2>
+    return %cast3 : vector<8x8xf8E5M2>
+}
+
+// -----
+
+// CHECK-LABEL: @conversion_scalar
+// CHECK:         [[SCALE_F32:%.+]] = arith.extf %arg1 : f8E8M0FNU to f32
+// CHECK-NEXT:    [[SPLAT_IN:%.+]] = vector.splat %arg0 : vector<1xf32>
+// CHECK-NEXT:    [[PACKED_TRUNC:%.+]] = amdgpu.packed_scaled_trunc [[SPLAT_IN]] into undef[0], [[SCALE_F32]]
+// CHECK-NEXT:    [[RESULT:%.+]] = vector.extract [[PACKED_TRUNC]][0]
+// CHECK-NEXT:    return [[RESULT]] : f8E5M2
+func.func @conversion_scalar(%in: f32, %scale: f8E8M0FNU) -> f8E5M2 {
+    %ext = arith.scaling_truncf %in, %scale : f32, f8E8M0FNU to f8E5M2
+    return %ext : f8E5M2
+}

>From 01eb7f01bbdb0b93489be68bedf9dfeeae0c980a Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Mon, 30 Jun 2025 20:12:25 +0000
Subject: [PATCH 03/17] add todo

---
 mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
index 2178499031e14..ab320fb6ffeec 100644
--- a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
+++ b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
@@ -482,6 +482,7 @@ ScalingExtFRewritePattern::matchAndRewrite(arith::ScalingExtFOp op,
   if (!outVecType) {
     Value inCast =
         rewriter.create<vector::SplatOp>(loc, VectorType::get(1, inType), in);
+    // TODO: replace this with non-packed ScaledExtOp
     Value scaleExt = rewriter.create<amdgpu::ScaledExtPackedOp>(
         loc, extScaleResultType, inCast, scale, 0);
     scaleExt = rewriter.replaceOpWithNewOp<vector::ExtractOp>(op, scaleExt, 0);
@@ -530,6 +531,7 @@ ScalingExtFRewritePattern::matchAndRewrite(arith::ScalingExtFOp op,
          i += sliceWidth, sliceWidth = opWidth - blockSize % opWidth) {
       Value slice = rewriter.create<vector::ExtractStridedSliceOp>(
           loc, block1D, i, sliceWidth, 1);
+      // TODO: replace this with non-packed ScaledExtOp for sliceWidth == 1
       Value scaleExt = rewriter.create<amdgpu::ScaledExtPackedOp>(
           loc, extScaleResultType, slice, uniformScale, 0);
       if (sliceWidth != opWidth)
@@ -586,10 +588,8 @@ ScalingTruncFRewritePattern::matchAndRewrite(arith::ScalingTruncFOp op,
 
   if (!outVecType) {
     Type inVecType = VectorType::get(1, inType);
-    // Type exisingVecType = VectorType::get(opWidth, outType);
     Value inCast = rewriter.create<vector::SplatOp>(loc, inVecType, in);
-    // Value existing =
-    //     rewriter.createOrFold<vector::SplatOp>(loc, exisingVecType, zero);
+    // TODO: replace this with non-packed ScaledTruncOp
     Value scaleTrunc = rewriter.create<amdgpu::PackedScaledTruncOp>(
         loc, truncScaleResultType, inCast, scale, 0, /*existing=*/nullptr);
     scaleTrunc =
@@ -637,9 +637,7 @@ ScalingTruncFRewritePattern::matchAndRewrite(arith::ScalingTruncFOp op,
          i += sliceWidth, sliceWidth = opWidth - blockSize % opWidth) {
       Value slice = rewriter.create<vector::ExtractStridedSliceOp>(
           loc, block1D, i, sliceWidth, 1);
-      // VectorType exisingVecType = VectorType::get(opWidth, outType);
-      // Value existing =
-      //     rewriter.createOrFold<vector::SplatOp>(loc, exisingVecType, zero);
+      // TODO: replace this with non-packed ScaledTruncOp for sliceWidth == 1
       Value scaleTrunc = rewriter.create<amdgpu::PackedScaledTruncOp>(
           loc, truncScaleResultType, slice, uniformScale, 0,
           /*existing=*/nullptr);

>From ef0816b0503f064cb044d97c584c91dafccaa8ac Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Tue, 1 Jul 2025 11:35:45 +0000
Subject: [PATCH 04/17] remove unused includes

---
 mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
index ab320fb6ffeec..06a221449c352 100644
--- a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
+++ b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
@@ -19,13 +19,10 @@
 #include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h"
 #include "mlir/Dialect/Vector/Utils/VectorUtils.h"
 #include "mlir/IR/BuiltinTypes.h"
-#include "mlir/IR/Matchers.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/IR/TypeUtilities.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/Support/LogicalResult.h"
 
 namespace mlir {
 #define GEN_PASS_DEF_ARITHTOAMDGPUCONVERSIONPASS

>From ba77b73ca2e014fccb7c72eaffaf7d79ef564069 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Tue, 1 Jul 2025 11:36:50 +0000
Subject: [PATCH 05/17] document getOriginalVectorValue

Signed-off-by: Tim Gymnich <tim at gymni.ch>
---
 mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
index 06a221449c352..672c6784f1771 100644
--- a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
+++ b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
@@ -421,6 +421,7 @@ LogicalResult TruncfToFloat16RewritePattern::matchAndRewrite(
   return success();
 }
 
+/// Look through a chain of broadcast / splat ops to recover the original source value.
 static Value getOriginalVectorValue(Value value) {
   Value current = value;
   while (Operation *definingOp = current.getDefiningOp()) {

>From bbb286f9a5fac9c089cedec67acf2377a11b1b13 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Tue, 1 Jul 2025 11:39:40 +0000
Subject: [PATCH 06/17] remove const

Signed-off-by: Tim Gymnich <tim at gymni.ch>
---
 mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
index 672c6784f1771..27a2dbabfe70c 100644
--- a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
+++ b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
@@ -451,7 +451,7 @@ LogicalResult
 ScalingExtFRewritePattern::matchAndRewrite(arith::ScalingExtFOp op,
                                            PatternRewriter &rewriter) const {
   Location loc = op.getLoc();
-  constexpr const int64_t opWidth = 2;
+  constexpr int64_t opWidth = 2;
 
   Value in = op.getIn();
   Value scale = op.getScale();
@@ -555,7 +555,7 @@ LogicalResult
 ScalingTruncFRewritePattern::matchAndRewrite(arith::ScalingTruncFOp op,
                                              PatternRewriter &rewriter) const {
   Location loc = op.getLoc();
-  constexpr const int64_t opWidth = 2;
+  constexpr int64_t opWidth = 2;
 
   Value in = op.getIn();
   Value scale = op.getScale();

>From 261f98577dea430ded832b9780675b6c2ea64330 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Tue, 1 Jul 2025 11:40:40 +0000
Subject: [PATCH 07/17] check dyn_casts

Signed-off-by: Tim Gymnich <tim at gymni.ch>
---
 .../ArithToAMDGPU/ArithToAMDGPU.cpp           | 28 +++++++++----------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
index 27a2dbabfe70c..06fcabbdfda5d 100644
--- a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
+++ b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
@@ -461,9 +461,9 @@ ScalingExtFRewritePattern::matchAndRewrite(arith::ScalingExtFOp op,
   Type inType = getElementTypeOrSelf(in);
   Type scaleType = getElementTypeOrSelf(scale);
   Type outType = getElementTypeOrSelf(out);
-  VectorType scaleVecType = dyn_cast<VectorType>(scale.getType());
-  VectorType inVecType = dyn_cast<VectorType>(in.getType());
+
   VectorType outVecType = dyn_cast<VectorType>(out.getType());
+  VectorType scaleVecType = dyn_cast<VectorType>(scale.getType());
 
   if (outVecType && outVecType.isScalable())
     return failure();
@@ -487,14 +487,14 @@ ScalingExtFRewritePattern::matchAndRewrite(arith::ScalingExtFOp op,
     return success();
   }
 
+  VectorType inVecType = cast<VectorType>(in.getType());
   Value origScale = getOriginalVectorValue(scale);
-  Type origScaleType = origScale.getType();
-  VectorType origScaleVecType = isa<VectorType>(origScaleType)
-                                    ? cast<VectorType>(origScaleType)
-                                    : VectorType::get(1, origScaleType);
 
-  ArrayRef<int64_t> originalScaleShape = origScaleVecType.getShape();
+  int64_t scalarShape[1] = {1};
   ArrayRef<int64_t> inShape = inVecType.getShape();
+  ArrayRef<int64_t> originalScaleShape = {scalarShape};
+  if (auto origScaleVecType = dyn_cast<VectorType>(origScale.getType()))
+    originalScaleShape = origScaleVecType.getShape();
 
   SmallVector<int64_t> paddedScaleShape(originalScaleShape);
   paddedScaleShape.insert(paddedScaleShape.end(),
@@ -565,9 +565,9 @@ ScalingTruncFRewritePattern::matchAndRewrite(arith::ScalingTruncFOp op,
   Type inType = getElementTypeOrSelf(in);
   Type scaleType = getElementTypeOrSelf(scale);
   Type outType = getElementTypeOrSelf(out);
-  VectorType scaleVecType = dyn_cast<VectorType>(scale.getType());
-  VectorType inVecType = dyn_cast<VectorType>(in.getType());
+
   VectorType outVecType = dyn_cast<VectorType>(out.getType());
+  VectorType scaleVecType = dyn_cast<VectorType>(scale.getType());
 
   if (outVecType && outVecType.isScalable())
     return failure();
@@ -595,14 +595,14 @@ ScalingTruncFRewritePattern::matchAndRewrite(arith::ScalingTruncFOp op,
     return success();
   }
 
+  VectorType inVecType = cast<VectorType>(in.getType());
   Value origScale = getOriginalVectorValue(scale);
-  Type origScaleType = origScale.getType();
-  VectorType origScaleVecType = isa<VectorType>(origScaleType)
-                                    ? cast<VectorType>(origScaleType)
-                                    : VectorType::get(1, origScaleType);
 
-  ArrayRef<int64_t> originalScaleShape = origScaleVecType.getShape();
+  int64_t scalarShape[1] = {1};
   ArrayRef<int64_t> inShape = inVecType.getShape();
+  ArrayRef<int64_t> originalScaleShape = {scalarShape};
+  if (auto origScaleVecType = dyn_cast<VectorType>(origScale.getType()))
+    originalScaleShape = origScaleVecType.getShape();
 
   SmallVector<int64_t> paddedScaleShape(originalScaleShape);
   paddedScaleShape.insert(paddedScaleShape.end(),

>From fe4c74418e1679611ad4082e558bcedf90350ce0 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Tue, 1 Jul 2025 11:41:15 +0000
Subject: [PATCH 08/17] replace failure with assertion and add fallback

Signed-off-by: Tim Gymnich <tim at gymni.ch>
---
 .../ArithToAMDGPU/ArithToAMDGPU.cpp           | 34 +++++++++++--------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
index 06fcabbdfda5d..0955b0c3e7752 100644
--- a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
+++ b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
@@ -500,20 +500,23 @@ ScalingExtFRewritePattern::matchAndRewrite(arith::ScalingExtFOp op,
   paddedScaleShape.insert(paddedScaleShape.end(),
                           inShape.size() - originalScaleShape.size(), 1);
 
-  auto ratio = computeShapeRatio(inShape, paddedScaleShape);
-  if (!ratio)
-    return failure();
+  auto maybeRatio = computeShapeRatio(inShape, paddedScaleShape);
+  assert(maybeRatio &&
+         "failed to derive block size from broadcast or splat operation");
+
+  SmallVector<int64_t> ratio =
+      maybeRatio.value_or(SmallVector<int64_t>(inShape.size(), 1));
 
-  const int64_t blockSize = computeProduct(*ratio);
+  int64_t blockSize = computeProduct(ratio);
 
   Value zero = rewriter.create<arith::ConstantOp>(
       loc, outType, rewriter.getFloatAttr(outType, 0.0));
   Value result = rewriter.createOrFold<vector::SplatOp>(loc, outVecType, zero);
 
-  for (SmallVector<int64_t> offsets : StaticTileOffsetRange(inShape, *ratio)) {
+  for (SmallVector<int64_t> offsets : StaticTileOffsetRange(inShape, ratio)) {
     SmallVector<int64_t> strides(offsets.size(), 1);
     Value block = rewriter.create<vector::ExtractStridedSliceOp>(
-        loc, in, offsets, *ratio, strides);
+        loc, in, offsets, ratio, strides);
     VectorType block1DType = VectorType::get(blockSize, inType);
     Value block1D =
         rewriter.create<vector::ShapeCastOp>(loc, block1DType, block);
@@ -539,7 +542,7 @@ ScalingExtFRewritePattern::matchAndRewrite(arith::ScalingExtFOp op,
           loc, scaleExt, blockResult, i, 1);
     }
 
-    VectorType resultType = VectorType::get(*ratio, outType);
+    VectorType resultType = VectorType::get(ratio, outType);
     Value cast =
         rewriter.create<vector::ShapeCastOp>(loc, resultType, blockResult);
     result = rewriter.create<vector::InsertStridedSliceOp>(loc, cast, result,
@@ -608,18 +611,21 @@ ScalingTruncFRewritePattern::matchAndRewrite(arith::ScalingTruncFOp op,
   paddedScaleShape.insert(paddedScaleShape.end(),
                           inShape.size() - originalScaleShape.size(), 1);
 
-  auto ratio = computeShapeRatio(inShape, paddedScaleShape);
-  if (!ratio)
-    return failure();
+  auto maybeRatio = computeShapeRatio(inShape, paddedScaleShape);
+  assert(maybeRatio &&
+         "failed to derive block size from broadcast or splat operation");
+
+  SmallVector<int64_t> ratio =
+      maybeRatio.value_or(SmallVector<int64_t>(inShape.size(), 1));
 
-  const int64_t blockSize = computeProduct(*ratio);
+  int64_t blockSize = computeProduct(ratio);
 
   Value result = rewriter.createOrFold<vector::SplatOp>(loc, outVecType, zero);
 
-  for (SmallVector<int64_t> offsets : StaticTileOffsetRange(inShape, *ratio)) {
+  for (SmallVector<int64_t> offsets : StaticTileOffsetRange(inShape, ratio)) {
     SmallVector<int64_t> strides(offsets.size(), 1);
     Value block = rewriter.create<vector::ExtractStridedSliceOp>(
-        loc, in, offsets, *ratio, strides);
+        loc, in, offsets, ratio, strides);
     VectorType block1DType = VectorType::get(blockSize, inType);
     Value block1D =
         rewriter.create<vector::ShapeCastOp>(loc, block1DType, block);
@@ -646,7 +652,7 @@ ScalingTruncFRewritePattern::matchAndRewrite(arith::ScalingTruncFOp op,
           loc, scaleTrunc, blockResult, i, 1);
     }
 
-    VectorType resultType = VectorType::get(*ratio, outType);
+    VectorType resultType = VectorType::get(ratio, outType);
     Value cast =
         rewriter.create<vector::ShapeCastOp>(loc, resultType, blockResult);
     result = rewriter.create<vector::InsertStridedSliceOp>(loc, cast, result,

>From 33e98fdfc34eafff451d833623913f557541d2d5 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Tue, 1 Jul 2025 12:42:31 +0000
Subject: [PATCH 09/17] simplify tests

---
 .../ArithToAMDGPU/scaling-extf.mlir           | 733 ++++--------------
 .../ArithToAMDGPU/scaling-truncf.mlir         | 592 ++------------
 2 files changed, 227 insertions(+), 1098 deletions(-)

diff --git a/mlir/test/Conversion/ArithToAMDGPU/scaling-extf.mlir b/mlir/test/Conversion/ArithToAMDGPU/scaling-extf.mlir
index 47266f55f9cf3..40c7e3ef6b267 100644
--- a/mlir/test/Conversion/ArithToAMDGPU/scaling-extf.mlir
+++ b/mlir/test/Conversion/ArithToAMDGPU/scaling-extf.mlir
@@ -1,37 +1,37 @@
 // RUN: mlir-opt --split-input-file %s -convert-arith-to-amdgpu="chipset=gfx950" | FileCheck %s
 
 // CHECK-LABEL: @conversion_f8_f32_fallback
-// CHECK:         [[CST:%.+]] = arith.constant dense<0.000000e+00> : vector<2x2xf32>
-// CHECK-NEXT:    [[SCALE_EXT:%.+]] = arith.extf %arg1 : vector<2x2xf8E8M0FNU> to vector<2x2xf32>
-// CHECK-NEXT:    [[IN_SLICE_00:%.+]] = vector.extract_strided_slice %arg0 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf8E5M2> to vector<1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_00:%.+]] = vector.shape_cast [[IN_SLICE_00]] : vector<1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_00:%.+]] = vector.extract [[SCALE_EXT]][0, 0] : f32 from vector<2x2xf32>
-// CHECK-NEXT:    [[PACKED_00:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_00]][0], [[SCALE_SCALAR_00]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_00:%.+]] = vector.extract_strided_slice [[PACKED_00]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_00:%.+]] = vector.shape_cast [[OUT_SLICE_00]] : vector<1xf32> to vector<1x1xf32>
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_00]], [[CST]] {offsets = [0, 0], strides = [1, 1]} : vector<1x1xf32> into vector<2x2xf32>
-// CHECK-NEXT:    [[IN_SLICE_01:%.+]] = vector.extract_strided_slice %arg0 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf8E5M2> to vector<1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_01:%.+]] = vector.shape_cast [[IN_SLICE_01]] : vector<1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_01:%.+]] = vector.extract [[SCALE_EXT]][0, 1] : f32 from vector<2x2xf32>
-// CHECK-NEXT:    [[PACKED_01:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_01]][0], [[SCALE_SCALAR_01]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_01:%.+]] = vector.extract_strided_slice [[PACKED_01]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_01:%.+]] = vector.shape_cast [[OUT_SLICE_01]] : vector<1xf32> to vector<1x1xf32>
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_01]], [[ACC_A]] {offsets = [0, 1], strides = [1, 1]} : vector<1x1xf32> into vector<2x2xf32>
-// CHECK-NEXT:    [[IN_SLICE_10:%.+]] = vector.extract_strided_slice %arg0 {offsets = [1, 0], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf8E5M2> to vector<1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_10:%.+]] = vector.shape_cast [[IN_SLICE_10]] : vector<1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_10:%.+]] = vector.extract [[SCALE_EXT]][1, 0] : f32 from vector<2x2xf32>
-// CHECK-NEXT:    [[PACKED_10:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_10]][0], [[SCALE_SCALAR_10]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_10:%.+]] = vector.extract_strided_slice [[PACKED_10]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_10:%.+]] = vector.shape_cast [[OUT_SLICE_10]] : vector<1xf32> to vector<1x1xf32>
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_10]], [[ACC_B]] {offsets = [1, 0], strides = [1, 1]} : vector<1x1xf32> into vector<2x2xf32>
-// CHECK-NEXT:    [[IN_SLICE_11:%.+]] = vector.extract_strided_slice %arg0 {offsets = [1, 1], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf8E5M2> to vector<1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_11:%.+]] = vector.shape_cast [[IN_SLICE_11]] : vector<1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_11:%.+]] = vector.extract [[SCALE_EXT]][1, 1] : f32 from vector<2x2xf32>
-// CHECK-NEXT:    [[PACKED_11:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_11]][0], [[SCALE_SCALAR_11]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_11:%.+]] = vector.extract_strided_slice [[PACKED_11]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_11:%.+]] = vector.shape_cast [[OUT_SLICE_11]] : vector<1xf32> to vector<1x1xf32>
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_11]], [[ACC_A]] {offsets = [1, 1], strides = [1, 1]} : vector<1x1xf32> into vector<2x2xf32>
-// CHECK-NEXT:    return [[ACC_B]] : vector<2x2xf32>
+// CHECK:         %[[CST:.+]] = arith.constant dense<0.000000e+00> : vector<2x2xf32>
+// CHECK-NEXT:    %[[SCALE_EXT:.+]] = arith.extf %arg1 : vector<2x2xf8E8M0FNU> to vector<2x2xf32>
+// CHECK-NEXT:    %[[IN_SLICE_00:.+]] = vector.extract_strided_slice %arg0 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf8E5M2> to vector<1x1xf8E5M2>
+// CHECK-NEXT:    %[[IN_VEC_00:.+]] = vector.shape_cast %[[IN_SLICE_00]] : vector<1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    %[[SCALE_SCALAR_00:.+]] = vector.extract %[[SCALE_EXT]][0, 0] : f32 from vector<2x2xf32>
+// CHECK-NEXT:    %[[PACKED_00:.+]] = amdgpu.scaled_ext_packed %[[IN_VEC_00]][0], %[[SCALE_SCALAR_00]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    %[[OUT_SLICE_00:.+]] = vector.extract_strided_slice %[[PACKED_00]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    %[[OUT_VEC_00:.+]] = vector.shape_cast %[[OUT_SLICE_00]] : vector<1xf32> to vector<1x1xf32>
+// CHECK-NEXT:    %[[ACC_A:.+]] = vector.insert_strided_slice %[[OUT_VEC_00]], %[[CST]] {offsets = [0, 0], strides = [1, 1]} : vector<1x1xf32> into vector<2x2xf32>
+// CHECK-NEXT:    %[[IN_SLICE_01:.+]] = vector.extract_strided_slice %arg0 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf8E5M2> to vector<1x1xf8E5M2>
+// CHECK-NEXT:    %[[IN_VEC_01:.+]] = vector.shape_cast %[[IN_SLICE_01]] : vector<1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    %[[SCALE_SCALAR_01:.+]] = vector.extract %[[SCALE_EXT]][0, 1] : f32 from vector<2x2xf32>
+// CHECK-NEXT:    %[[PACKED_01:.+]] = amdgpu.scaled_ext_packed %[[IN_VEC_01]][0], %[[SCALE_SCALAR_01]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    %[[OUT_SLICE_01:.+]] = vector.extract_strided_slice %[[PACKED_01]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    %[[OUT_VEC_01:.+]] = vector.shape_cast %[[OUT_SLICE_01]] : vector<1xf32> to vector<1x1xf32>
+// CHECK-NEXT:    %[[ACC_B:.+]] = vector.insert_strided_slice %[[OUT_VEC_01]], %[[ACC_A]] {offsets = [0, 1], strides = [1, 1]} : vector<1x1xf32> into vector<2x2xf32>
+// CHECK-NEXT:    %[[IN_SLICE_10:.+]] = vector.extract_strided_slice %arg0 {offsets = [1, 0], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf8E5M2> to vector<1x1xf8E5M2>
+// CHECK-NEXT:    %[[IN_VEC_10:.+]] = vector.shape_cast %[[IN_SLICE_10]] : vector<1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    %[[SCALE_SCALAR_10:.+]] = vector.extract %[[SCALE_EXT]][1, 0] : f32 from vector<2x2xf32>
+// CHECK-NEXT:    %[[PACKED_10:.+]] = amdgpu.scaled_ext_packed %[[IN_VEC_10]][0], %[[SCALE_SCALAR_10]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    %[[OUT_SLICE_10:.+]] = vector.extract_strided_slice %[[PACKED_10]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    %[[OUT_VEC_10:.+]] = vector.shape_cast %[[OUT_SLICE_10]] : vector<1xf32> to vector<1x1xf32>
+// CHECK-NEXT:    %[[ACC_A:.+]] = vector.insert_strided_slice %[[OUT_VEC_10]], %[[ACC_B]] {offsets = [1, 0], strides = [1, 1]} : vector<1x1xf32> into vector<2x2xf32>
+// CHECK-NEXT:    %[[IN_SLICE_11:.+]] = vector.extract_strided_slice %arg0 {offsets = [1, 1], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf8E5M2> to vector<1x1xf8E5M2>
+// CHECK-NEXT:    %[[IN_VEC_11:.+]] = vector.shape_cast %[[IN_SLICE_11]] : vector<1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    %[[SCALE_SCALAR_11:.+]] = vector.extract %[[SCALE_EXT]][1, 1] : f32 from vector<2x2xf32>
+// CHECK-NEXT:    %[[PACKED_11:.+]] = amdgpu.scaled_ext_packed %[[IN_VEC_11]][0], %[[SCALE_SCALAR_11]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    %[[OUT_SLICE_11:.+]] = vector.extract_strided_slice %[[PACKED_11]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    %[[OUT_VEC_11:.+]] = vector.shape_cast %[[OUT_SLICE_11]] : vector<1xf32> to vector<1x1xf32>
+// CHECK-NEXT:    %[[ACC_B:.+]] = vector.insert_strided_slice %[[OUT_VEC_11]], %[[ACC_A]] {offsets = [1, 1], strides = [1, 1]} : vector<1x1xf32> into vector<2x2xf32>
+// CHECK-NEXT:    return %[[ACC_B]] : vector<2x2xf32>
 func.func @conversion_f8_f32_fallback(%in: vector<2x2xf8E5M2>, %scale: vector<2x2xf8E8M0FNU>) -> vector<2x2xf32> {
     %ext = arith.scaling_extf %in, %scale : vector<2x2xf8E5M2>, vector<2x2xf8E8M0FNU> to vector<2x2xf32>
     return %ext : vector<2x2xf32>
@@ -40,37 +40,37 @@ func.func @conversion_f8_f32_fallback(%in: vector<2x2xf8E5M2>, %scale: vector<2x
 // -----
 
 // CHECK-LABEL: @conversion_f4_f32_fallback
-// CHECK:         [[CST:%.+]] = arith.constant dense<0.000000e+00> : vector<2x2xf32>
-// CHECK-NEXT:    [[SCALE_EXT:%.+]] = arith.extf %arg1 : vector<2x2xf8E8M0FNU> to vector<2x2xf32>
-// CHECK-NEXT:    [[IN_SLICE_00:%.+]] = vector.extract_strided_slice %arg0 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf4E2M1FN> to vector<1x1xf4E2M1FN>
-// CHECK-NEXT:    [[IN_VEC_00:%.+]] = vector.shape_cast [[IN_SLICE_00]] : vector<1x1xf4E2M1FN> to vector<1xf4E2M1FN>
-// CHECK-NEXT:    [[SCALE_SCALAR_00:%.+]] = vector.extract [[SCALE_EXT]][0, 0] : f32 from vector<2x2xf32>
-// CHECK-NEXT:    [[PACKED_00:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_00]][0], [[SCALE_SCALAR_00]] : vector<1xf4E2M1FN> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_00:%.+]] = vector.extract_strided_slice [[PACKED_00]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_00:%.+]] = vector.shape_cast [[OUT_SLICE_00]] : vector<1xf32> to vector<1x1xf32>
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_00]], [[CST]] {offsets = [0, 0], strides = [1, 1]} : vector<1x1xf32> into vector<2x2xf32>
-// CHECK-NEXT:    [[IN_SLICE_01:%.+]] = vector.extract_strided_slice %arg0 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf4E2M1FN> to vector<1x1xf4E2M1FN>
-// CHECK-NEXT:    [[IN_VEC_01:%.+]] = vector.shape_cast [[IN_SLICE_01]] : vector<1x1xf4E2M1FN> to vector<1xf4E2M1FN>
-// CHECK-NEXT:    [[SCALE_SCALAR_01:%.+]] = vector.extract [[SCALE_EXT]][0, 1] : f32 from vector<2x2xf32>
-// CHECK-NEXT:    [[PACKED_01:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_01]][0], [[SCALE_SCALAR_01]] : vector<1xf4E2M1FN> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_01:%.+]] = vector.extract_strided_slice [[PACKED_01]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_01:%.+]] = vector.shape_cast [[OUT_SLICE_01]] : vector<1xf32> to vector<1x1xf32>
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_01]], [[ACC_A]] {offsets = [0, 1], strides = [1, 1]} : vector<1x1xf32> into vector<2x2xf32>
-// CHECK-NEXT:    [[IN_SLICE_10:%.+]] = vector.extract_strided_slice %arg0 {offsets = [1, 0], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf4E2M1FN> to vector<1x1xf4E2M1FN>
-// CHECK-NEXT:    [[IN_VEC_10:%.+]] = vector.shape_cast [[IN_SLICE_10]] : vector<1x1xf4E2M1FN> to vector<1xf4E2M1FN>
-// CHECK-NEXT:    [[SCALE_SCALAR_10:%.+]] = vector.extract [[SCALE_EXT]][1, 0] : f32 from vector<2x2xf32>
-// CHECK-NEXT:    [[PACKED_10:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_10]][0], [[SCALE_SCALAR_10]] : vector<1xf4E2M1FN> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_10:%.+]] = vector.extract_strided_slice [[PACKED_10]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_10:%.+]] = vector.shape_cast [[OUT_SLICE_10]] : vector<1xf32> to vector<1x1xf32>
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_10]], [[ACC_B]] {offsets = [1, 0], strides = [1, 1]} : vector<1x1xf32> into vector<2x2xf32>
-// CHECK-NEXT:    [[IN_SLICE_11:%.+]] = vector.extract_strided_slice %arg0 {offsets = [1, 1], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf4E2M1FN> to vector<1x1xf4E2M1FN>
-// CHECK-NEXT:    [[IN_VEC_11:%.+]] = vector.shape_cast [[IN_SLICE_11]] : vector<1x1xf4E2M1FN> to vector<1xf4E2M1FN>
-// CHECK-NEXT:    [[SCALE_SCALAR_11:%.+]] = vector.extract [[SCALE_EXT]][1, 1] : f32 from vector<2x2xf32>
-// CHECK-NEXT:    [[PACKED_11:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_11]][0], [[SCALE_SCALAR_11]] : vector<1xf4E2M1FN> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_11:%.+]] = vector.extract_strided_slice [[PACKED_11]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_11:%.+]] = vector.shape_cast [[OUT_SLICE_11]] : vector<1xf32> to vector<1x1xf32>
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_11]], [[ACC_A]] {offsets = [1, 1], strides = [1, 1]} : vector<1x1xf32> into vector<2x2xf32>
-// CHECK-NEXT:    return [[ACC_B]] : vector<2x2xf32>
+// CHECK:         %[[CST:.+]] = arith.constant dense<0.000000e+00> : vector<2x2xf32>
+// CHECK-NEXT:    %[[SCALE_EXT:.+]] = arith.extf %arg1 : vector<2x2xf8E8M0FNU> to vector<2x2xf32>
+// CHECK-NEXT:    %[[IN_SLICE_00:.+]] = vector.extract_strided_slice %arg0 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf4E2M1FN> to vector<1x1xf4E2M1FN>
+// CHECK-NEXT:    %[[IN_VEC_00:.+]] = vector.shape_cast %[[IN_SLICE_00]] : vector<1x1xf4E2M1FN> to vector<1xf4E2M1FN>
+// CHECK-NEXT:    %[[SCALE_SCALAR_00:.+]] = vector.extract %[[SCALE_EXT]][0, 0] : f32 from vector<2x2xf32>
+// CHECK-NEXT:    %[[PACKED_00:.+]] = amdgpu.scaled_ext_packed %[[IN_VEC_00]][0], %[[SCALE_SCALAR_00]] : vector<1xf4E2M1FN> to vector<2xf32>
+// CHECK-NEXT:    %[[OUT_SLICE_00:.+]] = vector.extract_strided_slice %[[PACKED_00]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    %[[OUT_VEC_00:.+]] = vector.shape_cast %[[OUT_SLICE_00]] : vector<1xf32> to vector<1x1xf32>
+// CHECK-NEXT:    %[[ACC_A:.+]] = vector.insert_strided_slice %[[OUT_VEC_00]], %[[CST]] {offsets = [0, 0], strides = [1, 1]} : vector<1x1xf32> into vector<2x2xf32>
+// CHECK-NEXT:    %[[IN_SLICE_01:.+]] = vector.extract_strided_slice %arg0 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf4E2M1FN> to vector<1x1xf4E2M1FN>
+// CHECK-NEXT:    %[[IN_VEC_01:.+]] = vector.shape_cast %[[IN_SLICE_01]] : vector<1x1xf4E2M1FN> to vector<1xf4E2M1FN>
+// CHECK-NEXT:    %[[SCALE_SCALAR_01:.+]] = vector.extract %[[SCALE_EXT]][0, 1] : f32 from vector<2x2xf32>
+// CHECK-NEXT:    %[[PACKED_01:.+]] = amdgpu.scaled_ext_packed %[[IN_VEC_01]][0], %[[SCALE_SCALAR_01]] : vector<1xf4E2M1FN> to vector<2xf32>
+// CHECK-NEXT:    %[[OUT_SLICE_01:.+]] = vector.extract_strided_slice %[[PACKED_01]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    %[[OUT_VEC_01:.+]] = vector.shape_cast %[[OUT_SLICE_01]] : vector<1xf32> to vector<1x1xf32>
+// CHECK-NEXT:    %[[ACC_B:.+]] = vector.insert_strided_slice %[[OUT_VEC_01]], %[[ACC_A]] {offsets = [0, 1], strides = [1, 1]} : vector<1x1xf32> into vector<2x2xf32>
+// CHECK-NEXT:    %[[IN_SLICE_10:.+]] = vector.extract_strided_slice %arg0 {offsets = [1, 0], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf4E2M1FN> to vector<1x1xf4E2M1FN>
+// CHECK-NEXT:    %[[IN_VEC_10:.+]] = vector.shape_cast %[[IN_SLICE_10]] : vector<1x1xf4E2M1FN> to vector<1xf4E2M1FN>
+// CHECK-NEXT:    %[[SCALE_SCALAR_10:.+]] = vector.extract %[[SCALE_EXT]][1, 0] : f32 from vector<2x2xf32>
+// CHECK-NEXT:    %[[PACKED_10:.+]] = amdgpu.scaled_ext_packed %[[IN_VEC_10]][0], %[[SCALE_SCALAR_10]] : vector<1xf4E2M1FN> to vector<2xf32>
+// CHECK-NEXT:    %[[OUT_SLICE_10:.+]] = vector.extract_strided_slice %[[PACKED_10]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    %[[OUT_VEC_10:.+]] = vector.shape_cast %[[OUT_SLICE_10]] : vector<1xf32> to vector<1x1xf32>
+// CHECK-NEXT:    %[[ACC_A:.+]] = vector.insert_strided_slice %[[OUT_VEC_10]], %[[ACC_B]] {offsets = [1, 0], strides = [1, 1]} : vector<1x1xf32> into vector<2x2xf32>
+// CHECK-NEXT:    %[[IN_SLICE_11:.+]] = vector.extract_strided_slice %arg0 {offsets = [1, 1], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf4E2M1FN> to vector<1x1xf4E2M1FN>
+// CHECK-NEXT:    %[[IN_VEC_11:.+]] = vector.shape_cast %[[IN_SLICE_11]] : vector<1x1xf4E2M1FN> to vector<1xf4E2M1FN>
+// CHECK-NEXT:    %[[SCALE_SCALAR_11:.+]] = vector.extract %[[SCALE_EXT]][1, 1] : f32 from vector<2x2xf32>
+// CHECK-NEXT:    %[[PACKED_11:.+]] = amdgpu.scaled_ext_packed %[[IN_VEC_11]][0], %[[SCALE_SCALAR_11]] : vector<1xf4E2M1FN> to vector<2xf32>
+// CHECK-NEXT:    %[[OUT_SLICE_11:.+]] = vector.extract_strided_slice %[[PACKED_11]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    %[[OUT_VEC_11:.+]] = vector.shape_cast %[[OUT_SLICE_11]] : vector<1xf32> to vector<1x1xf32>
+// CHECK-NEXT:    %[[ACC_B:.+]] = vector.insert_strided_slice %[[OUT_VEC_11]], %[[ACC_A]] {offsets = [1, 1], strides = [1, 1]} : vector<1x1xf32> into vector<2x2xf32>
+// CHECK-NEXT:    return %[[ACC_B]] : vector<2x2xf32>
 func.func @conversion_f4_f32_fallback(%in: vector<2x2xf4E2M1FN>, %scale: vector<2x2xf8E8M0FNU>) -> vector<2x2xf32> {
     %ext = arith.scaling_extf %in, %scale : vector<2x2xf4E2M1FN>, vector<2x2xf8E8M0FNU> to vector<2x2xf32>
     return %ext : vector<2x2xf32>
@@ -79,37 +79,37 @@ func.func @conversion_f4_f32_fallback(%in: vector<2x2xf4E2M1FN>, %scale: vector<
 // -----
 
 // CHECK-LABEL: @conversion_f8_f16_fallback
-// CHECK:         [[CST:%.+]] = arith.constant dense<0.000000e+00> : vector<2x2xf16>
-// CHECK-NEXT:    [[SCALE_EXT:%.+]] = arith.extf %arg1 : vector<2x2xf8E8M0FNU> to vector<2x2xf32>
-// CHECK-NEXT:    [[IN_SLICE_00:%.+]] = vector.extract_strided_slice %arg0 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf8E5M2> to vector<1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_00:%.+]] = vector.shape_cast [[IN_SLICE_00]] : vector<1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_00:%.+]] = vector.extract [[SCALE_EXT]][0, 0] : f32 from vector<2x2xf32>
-// CHECK-NEXT:    [[PACKED_00:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_00]][0], [[SCALE_SCALAR_00]] : vector<1xf8E5M2> to vector<2xf16>
-// CHECK-NEXT:    [[OUT_SLICE_00:%.+]] = vector.extract_strided_slice [[PACKED_00]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf16> to vector<1xf16>
-// CHECK-NEXT:    [[OUT_VEC_00:%.+]] = vector.shape_cast [[OUT_SLICE_00]] : vector<1xf16> to vector<1x1xf16>
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_00]], [[CST]] {offsets = [0, 0], strides = [1, 1]} : vector<1x1xf16> into vector<2x2xf16>
-// CHECK-NEXT:    [[IN_SLICE_01:%.+]] = vector.extract_strided_slice %arg0 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf8E5M2> to vector<1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_01:%.+]] = vector.shape_cast [[IN_SLICE_01]] : vector<1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_01:%.+]] = vector.extract [[SCALE_EXT]][0, 1] : f32 from vector<2x2xf32>
-// CHECK-NEXT:    [[PACKED_01:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_01]][0], [[SCALE_SCALAR_01]] : vector<1xf8E5M2> to vector<2xf16>
-// CHECK-NEXT:    [[OUT_SLICE_01:%.+]] = vector.extract_strided_slice [[PACKED_01]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf16> to vector<1xf16>
-// CHECK-NEXT:    [[OUT_VEC_01:%.+]] = vector.shape_cast [[OUT_SLICE_01]] : vector<1xf16> to vector<1x1xf16>
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_01]], [[ACC_A]] {offsets = [0, 1], strides = [1, 1]} : vector<1x1xf16> into vector<2x2xf16>
-// CHECK-NEXT:    [[IN_SLICE_10:%.+]] = vector.extract_strided_slice %arg0 {offsets = [1, 0], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf8E5M2> to vector<1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_10:%.+]] = vector.shape_cast [[IN_SLICE_10]] : vector<1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_10:%.+]] = vector.extract [[SCALE_EXT]][1, 0] : f32 from vector<2x2xf32>
-// CHECK-NEXT:    [[PACKED_10:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_10]][0], [[SCALE_SCALAR_10]] : vector<1xf8E5M2> to vector<2xf16>
-// CHECK-NEXT:    [[OUT_SLICE_10:%.+]] = vector.extract_strided_slice [[PACKED_10]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf16> to vector<1xf16>
-// CHECK-NEXT:    [[OUT_VEC_10:%.+]] = vector.shape_cast [[OUT_SLICE_10]] : vector<1xf16> to vector<1x1xf16>
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_10]], [[ACC_B]] {offsets = [1, 0], strides = [1, 1]} : vector<1x1xf16> into vector<2x2xf16>
-// CHECK-NEXT:    [[IN_SLICE_11:%.+]] = vector.extract_strided_slice %arg0 {offsets = [1, 1], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf8E5M2> to vector<1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_11:%.+]] = vector.shape_cast [[IN_SLICE_11]] : vector<1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_11:%.+]] = vector.extract [[SCALE_EXT]][1, 1] : f32 from vector<2x2xf32>
-// CHECK-NEXT:    [[PACKED_11:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_11]][0], [[SCALE_SCALAR_11]] : vector<1xf8E5M2> to vector<2xf16>
-// CHECK-NEXT:    [[OUT_SLICE_11:%.+]] = vector.extract_strided_slice [[PACKED_11]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf16> to vector<1xf16>
-// CHECK-NEXT:    [[OUT_VEC_11:%.+]] = vector.shape_cast [[OUT_SLICE_11]] : vector<1xf16> to vector<1x1xf16>
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_11]], [[ACC_A]] {offsets = [1, 1], strides = [1, 1]} : vector<1x1xf16> into vector<2x2xf16>
-// CHECK-NEXT:    return [[ACC_B]] : vector<2x2xf16>
+// CHECK:         %[[CST:.+]] = arith.constant dense<0.000000e+00> : vector<2x2xf16>
+// CHECK-NEXT:    %[[SCALE_EXT:.+]] = arith.extf %arg1 : vector<2x2xf8E8M0FNU> to vector<2x2xf32>
+// CHECK-NEXT:    %[[IN_SLICE_00:.+]] = vector.extract_strided_slice %arg0 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf8E5M2> to vector<1x1xf8E5M2>
+// CHECK-NEXT:    %[[IN_VEC_00:.+]] = vector.shape_cast %[[IN_SLICE_00]] : vector<1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    %[[SCALE_SCALAR_00:.+]] = vector.extract %[[SCALE_EXT]][0, 0] : f32 from vector<2x2xf32>
+// CHECK-NEXT:    %[[PACKED_00:.+]] = amdgpu.scaled_ext_packed %[[IN_VEC_00]][0], %[[SCALE_SCALAR_00]] : vector<1xf8E5M2> to vector<2xf16>
+// CHECK-NEXT:    %[[OUT_SLICE_00:.+]] = vector.extract_strided_slice %[[PACKED_00]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf16> to vector<1xf16>
+// CHECK-NEXT:    %[[OUT_VEC_00:.+]] = vector.shape_cast %[[OUT_SLICE_00]] : vector<1xf16> to vector<1x1xf16>
+// CHECK-NEXT:    %[[ACC_A:.+]] = vector.insert_strided_slice %[[OUT_VEC_00]], %[[CST]] {offsets = [0, 0], strides = [1, 1]} : vector<1x1xf16> into vector<2x2xf16>
+// CHECK-NEXT:    %[[IN_SLICE_01:.+]] = vector.extract_strided_slice %arg0 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf8E5M2> to vector<1x1xf8E5M2>
+// CHECK-NEXT:    %[[IN_VEC_01:.+]] = vector.shape_cast %[[IN_SLICE_01]] : vector<1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    %[[SCALE_SCALAR_01:.+]] = vector.extract %[[SCALE_EXT]][0, 1] : f32 from vector<2x2xf32>
+// CHECK-NEXT:    %[[PACKED_01:.+]] = amdgpu.scaled_ext_packed %[[IN_VEC_01]][0], %[[SCALE_SCALAR_01]] : vector<1xf8E5M2> to vector<2xf16>
+// CHECK-NEXT:    %[[OUT_SLICE_01:.+]] = vector.extract_strided_slice %[[PACKED_01]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf16> to vector<1xf16>
+// CHECK-NEXT:    %[[OUT_VEC_01:.+]] = vector.shape_cast %[[OUT_SLICE_01]] : vector<1xf16> to vector<1x1xf16>
+// CHECK-NEXT:    %[[ACC_B:.+]] = vector.insert_strided_slice %[[OUT_VEC_01]], %[[ACC_A]] {offsets = [0, 1], strides = [1, 1]} : vector<1x1xf16> into vector<2x2xf16>
+// CHECK-NEXT:    %[[IN_SLICE_10:.+]] = vector.extract_strided_slice %arg0 {offsets = [1, 0], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf8E5M2> to vector<1x1xf8E5M2>
+// CHECK-NEXT:    %[[IN_VEC_10:.+]] = vector.shape_cast %[[IN_SLICE_10]] : vector<1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    %[[SCALE_SCALAR_10:.+]] = vector.extract %[[SCALE_EXT]][1, 0] : f32 from vector<2x2xf32>
+// CHECK-NEXT:    %[[PACKED_10:.+]] = amdgpu.scaled_ext_packed %[[IN_VEC_10]][0], %[[SCALE_SCALAR_10]] : vector<1xf8E5M2> to vector<2xf16>
+// CHECK-NEXT:    %[[OUT_SLICE_10:.+]] = vector.extract_strided_slice %[[PACKED_10]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf16> to vector<1xf16>
+// CHECK-NEXT:    %[[OUT_VEC_10:.+]] = vector.shape_cast %[[OUT_SLICE_10]] : vector<1xf16> to vector<1x1xf16>
+// CHECK-NEXT:    %[[ACC_A:.+]] = vector.insert_strided_slice %[[OUT_VEC_10]], %[[ACC_B]] {offsets = [1, 0], strides = [1, 1]} : vector<1x1xf16> into vector<2x2xf16>
+// CHECK-NEXT:    %[[IN_SLICE_11:.+]] = vector.extract_strided_slice %arg0 {offsets = [1, 1], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf8E5M2> to vector<1x1xf8E5M2>
+// CHECK-NEXT:    %[[IN_VEC_11:.+]] = vector.shape_cast %[[IN_SLICE_11]] : vector<1x1xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    %[[SCALE_SCALAR_11:.+]] = vector.extract %[[SCALE_EXT]][1, 1] : f32 from vector<2x2xf32>
+// CHECK-NEXT:    %[[PACKED_11:.+]] = amdgpu.scaled_ext_packed %[[IN_VEC_11]][0], %[[SCALE_SCALAR_11]] : vector<1xf8E5M2> to vector<2xf16>
+// CHECK-NEXT:    %[[OUT_SLICE_11:.+]] = vector.extract_strided_slice %[[PACKED_11]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf16> to vector<1xf16>
+// CHECK-NEXT:    %[[OUT_VEC_11:.+]] = vector.shape_cast %[[OUT_SLICE_11]] : vector<1xf16> to vector<1x1xf16>
+// CHECK-NEXT:    %[[ACC_B:.+]] = vector.insert_strided_slice %[[OUT_VEC_11]], %[[ACC_A]] {offsets = [1, 1], strides = [1, 1]} : vector<1x1xf16> into vector<2x2xf16>
+// CHECK-NEXT:    return %[[ACC_B]] : vector<2x2xf16>
 func.func @conversion_f8_f16_fallback(%in: vector<2x2xf8E5M2>, %scale: vector<2x2xf8E8M0FNU>) -> vector<2x2xf16> {
     %ext = arith.scaling_extf %in, %scale : vector<2x2xf8E5M2>, vector<2x2xf8E8M0FNU> to vector<2x2xf16>
     return %ext : vector<2x2xf16>
@@ -118,37 +118,37 @@ func.func @conversion_f8_f16_fallback(%in: vector<2x2xf8E5M2>, %scale: vector<2x
 // -----
 
 // CHECK-LABEL: @conversion_f4_f16_fallback
-// CHECK:         [[CST:%.+]] = arith.constant dense<0.000000e+00> : vector<2x2xf16>
-// CHECK-NEXT:    [[SCALE_EXT:%.+]] = arith.extf %arg1 : vector<2x2xf8E8M0FNU> to vector<2x2xf32>
-// CHECK-NEXT:    [[IN_SLICE_00:%.+]] = vector.extract_strided_slice %arg0 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf4E2M1FN> to vector<1x1xf4E2M1FN>
-// CHECK-NEXT:    [[IN_VEC_00:%.+]] = vector.shape_cast [[IN_SLICE_00]] : vector<1x1xf4E2M1FN> to vector<1xf4E2M1FN>
-// CHECK-NEXT:    [[SCALE_SCALAR_00:%.+]] = vector.extract [[SCALE_EXT]][0, 0] : f32 from vector<2x2xf32>
-// CHECK-NEXT:    [[PACKED_00:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_00]][0], [[SCALE_SCALAR_00]] : vector<1xf4E2M1FN> to vector<2xf16>
-// CHECK-NEXT:    [[OUT_SLICE_00:%.+]] = vector.extract_strided_slice [[PACKED_00]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf16> to vector<1xf16>
-// CHECK-NEXT:    [[OUT_VEC_00:%.+]] = vector.shape_cast [[OUT_SLICE_00]] : vector<1xf16> to vector<1x1xf16>
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_00]], [[CST]] {offsets = [0, 0], strides = [1, 1]} : vector<1x1xf16> into vector<2x2xf16>
-// CHECK-NEXT:    [[IN_SLICE_01:%.+]] = vector.extract_strided_slice %arg0 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf4E2M1FN> to vector<1x1xf4E2M1FN>
-// CHECK-NEXT:    [[IN_VEC_01:%.+]] = vector.shape_cast [[IN_SLICE_01]] : vector<1x1xf4E2M1FN> to vector<1xf4E2M1FN>
-// CHECK-NEXT:    [[SCALE_SCALAR_01:%.+]] = vector.extract [[SCALE_EXT]][0, 1] : f32 from vector<2x2xf32>
-// CHECK-NEXT:    [[PACKED_01:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_01]][0], [[SCALE_SCALAR_01]] : vector<1xf4E2M1FN> to vector<2xf16>
-// CHECK-NEXT:    [[OUT_SLICE_01:%.+]] = vector.extract_strided_slice [[PACKED_01]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf16> to vector<1xf16>
-// CHECK-NEXT:    [[OUT_VEC_01:%.+]] = vector.shape_cast [[OUT_SLICE_01]] : vector<1xf16> to vector<1x1xf16>
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_01]], [[ACC_A]] {offsets = [0, 1], strides = [1, 1]} : vector<1x1xf16> into vector<2x2xf16>
-// CHECK-NEXT:    [[IN_SLICE_10:%.+]] = vector.extract_strided_slice %arg0 {offsets = [1, 0], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf4E2M1FN> to vector<1x1xf4E2M1FN>
-// CHECK-NEXT:    [[IN_VEC_10:%.+]] = vector.shape_cast [[IN_SLICE_10]] : vector<1x1xf4E2M1FN> to vector<1xf4E2M1FN>
-// CHECK-NEXT:    [[SCALE_SCALAR_10:%.+]] = vector.extract [[SCALE_EXT]][1, 0] : f32 from vector<2x2xf32>
-// CHECK-NEXT:    [[PACKED_10:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_10]][0], [[SCALE_SCALAR_10]] : vector<1xf4E2M1FN> to vector<2xf16>
-// CHECK-NEXT:    [[OUT_SLICE_10:%.+]] = vector.extract_strided_slice [[PACKED_10]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf16> to vector<1xf16>
-// CHECK-NEXT:    [[OUT_VEC_10:%.+]] = vector.shape_cast [[OUT_SLICE_10]] : vector<1xf16> to vector<1x1xf16>
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_10]], [[ACC_B]] {offsets = [1, 0], strides = [1, 1]} : vector<1x1xf16> into vector<2x2xf16>
-// CHECK-NEXT:    [[IN_SLICE_11:%.+]] = vector.extract_strided_slice %arg0 {offsets = [1, 1], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf4E2M1FN> to vector<1x1xf4E2M1FN>
-// CHECK-NEXT:    [[IN_VEC_11:%.+]] = vector.shape_cast [[IN_SLICE_11]] : vector<1x1xf4E2M1FN> to vector<1xf4E2M1FN>
-// CHECK-NEXT:    [[SCALE_SCALAR_11:%.+]] = vector.extract [[SCALE_EXT]][1, 1] : f32 from vector<2x2xf32>
-// CHECK-NEXT:    [[PACKED_11:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_11]][0], [[SCALE_SCALAR_11]] : vector<1xf4E2M1FN> to vector<2xf16>
-// CHECK-NEXT:    [[OUT_SLICE_11:%.+]] = vector.extract_strided_slice [[PACKED_11]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf16> to vector<1xf16>
-// CHECK-NEXT:    [[OUT_VEC_11:%.+]] = vector.shape_cast [[OUT_SLICE_11]] : vector<1xf16> to vector<1x1xf16>
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_11]], [[ACC_A]] {offsets = [1, 1], strides = [1, 1]} : vector<1x1xf16> into vector<2x2xf16>
-// CHECK-NEXT:    return [[ACC_B]] : vector<2x2xf16>
+// CHECK:         %[[CST:.+]] = arith.constant dense<0.000000e+00> : vector<2x2xf16>
+// CHECK-NEXT:    %[[SCALE_EXT:.+]] = arith.extf %arg1 : vector<2x2xf8E8M0FNU> to vector<2x2xf32>
+// CHECK-NEXT:    %[[IN_SLICE_00:.+]] = vector.extract_strided_slice %arg0 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf4E2M1FN> to vector<1x1xf4E2M1FN>
+// CHECK-NEXT:    %[[IN_VEC_00:.+]] = vector.shape_cast %[[IN_SLICE_00]] : vector<1x1xf4E2M1FN> to vector<1xf4E2M1FN>
+// CHECK-NEXT:    %[[SCALE_SCALAR_00:.+]] = vector.extract %[[SCALE_EXT]][0, 0] : f32 from vector<2x2xf32>
+// CHECK-NEXT:    %[[PACKED_00:.+]] = amdgpu.scaled_ext_packed %[[IN_VEC_00]][0], %[[SCALE_SCALAR_00]] : vector<1xf4E2M1FN> to vector<2xf16>
+// CHECK-NEXT:    %[[OUT_SLICE_00:.+]] = vector.extract_strided_slice %[[PACKED_00]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf16> to vector<1xf16>
+// CHECK-NEXT:    %[[OUT_VEC_00:.+]] = vector.shape_cast %[[OUT_SLICE_00]] : vector<1xf16> to vector<1x1xf16>
+// CHECK-NEXT:    %[[ACC_A:.+]] = vector.insert_strided_slice %[[OUT_VEC_00]], %[[CST]] {offsets = [0, 0], strides = [1, 1]} : vector<1x1xf16> into vector<2x2xf16>
+// CHECK-NEXT:    %[[IN_SLICE_01:.+]] = vector.extract_strided_slice %arg0 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf4E2M1FN> to vector<1x1xf4E2M1FN>
+// CHECK-NEXT:    %[[IN_VEC_01:.+]] = vector.shape_cast %[[IN_SLICE_01]] : vector<1x1xf4E2M1FN> to vector<1xf4E2M1FN>
+// CHECK-NEXT:    %[[SCALE_SCALAR_01:.+]] = vector.extract %[[SCALE_EXT]][0, 1] : f32 from vector<2x2xf32>
+// CHECK-NEXT:    %[[PACKED_01:.+]] = amdgpu.scaled_ext_packed %[[IN_VEC_01]][0], %[[SCALE_SCALAR_01]] : vector<1xf4E2M1FN> to vector<2xf16>
+// CHECK-NEXT:    %[[OUT_SLICE_01:.+]] = vector.extract_strided_slice %[[PACKED_01]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf16> to vector<1xf16>
+// CHECK-NEXT:    %[[OUT_VEC_01:.+]] = vector.shape_cast %[[OUT_SLICE_01]] : vector<1xf16> to vector<1x1xf16>
+// CHECK-NEXT:    %[[ACC_B:.+]] = vector.insert_strided_slice %[[OUT_VEC_01]], %[[ACC_A]] {offsets = [0, 1], strides = [1, 1]} : vector<1x1xf16> into vector<2x2xf16>
+// CHECK-NEXT:    %[[IN_SLICE_10:.+]] = vector.extract_strided_slice %arg0 {offsets = [1, 0], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf4E2M1FN> to vector<1x1xf4E2M1FN>
+// CHECK-NEXT:    %[[IN_VEC_10:.+]] = vector.shape_cast %[[IN_SLICE_10]] : vector<1x1xf4E2M1FN> to vector<1xf4E2M1FN>
+// CHECK-NEXT:    %[[SCALE_SCALAR_10:.+]] = vector.extract %[[SCALE_EXT]][1, 0] : f32 from vector<2x2xf32>
+// CHECK-NEXT:    %[[PACKED_10:.+]] = amdgpu.scaled_ext_packed %[[IN_VEC_10]][0], %[[SCALE_SCALAR_10]] : vector<1xf4E2M1FN> to vector<2xf16>
+// CHECK-NEXT:    %[[OUT_SLICE_10:.+]] = vector.extract_strided_slice %[[PACKED_10]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf16> to vector<1xf16>
+// CHECK-NEXT:    %[[OUT_VEC_10:.+]] = vector.shape_cast %[[OUT_SLICE_10]] : vector<1xf16> to vector<1x1xf16>
+// CHECK-NEXT:    %[[ACC_A:.+]] = vector.insert_strided_slice %[[OUT_VEC_10]], %[[ACC_B]] {offsets = [1, 0], strides = [1, 1]} : vector<1x1xf16> into vector<2x2xf16>
+// CHECK-NEXT:    %[[IN_SLICE_11:.+]] = vector.extract_strided_slice %arg0 {offsets = [1, 1], sizes = [1, 1], strides = [1, 1]} : vector<2x2xf4E2M1FN> to vector<1x1xf4E2M1FN>
+// CHECK-NEXT:    %[[IN_VEC_11:.+]] = vector.shape_cast %[[IN_SLICE_11]] : vector<1x1xf4E2M1FN> to vector<1xf4E2M1FN>
+// CHECK-NEXT:    %[[SCALE_SCALAR_11:.+]] = vector.extract %[[SCALE_EXT]][1, 1] : f32 from vector<2x2xf32>
+// CHECK-NEXT:    %[[PACKED_11:.+]] = amdgpu.scaled_ext_packed %[[IN_VEC_11]][0], %[[SCALE_SCALAR_11]] : vector<1xf4E2M1FN> to vector<2xf16>
+// CHECK-NEXT:    %[[OUT_SLICE_11:.+]] = vector.extract_strided_slice %[[PACKED_11]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf16> to vector<1xf16>
+// CHECK-NEXT:    %[[OUT_VEC_11:.+]] = vector.shape_cast %[[OUT_SLICE_11]] : vector<1xf16> to vector<1x1xf16>
+// CHECK-NEXT:    %[[ACC_B:.+]] = vector.insert_strided_slice %[[OUT_VEC_11]], %[[ACC_A]] {offsets = [1, 1], strides = [1, 1]} : vector<1x1xf16> into vector<2x2xf16>
+// CHECK-NEXT:    return %[[ACC_B]] : vector<2x2xf16>
 func.func @conversion_f4_f16_fallback(%in: vector<2x2xf4E2M1FN>, %scale: vector<2x2xf8E8M0FNU>) -> vector<2x2xf16> {
     %ext = arith.scaling_extf %in, %scale : vector<2x2xf4E2M1FN>, vector<2x2xf8E8M0FNU> to vector<2x2xf16>
     return %ext : vector<2x2xf16>
@@ -157,461 +157,26 @@ func.func @conversion_f4_f16_fallback(%in: vector<2x2xf4E2M1FN>, %scale: vector<
 // -----
 
 // CHECK-LABEL: @conversion_broadcast
-// CHECK:         [[CST:%.+]] = arith.constant dense<0.000000e+00> : vector<8x2x4xf32>
-// CHECK-NEXT:    [[BCAST:%.+]] = vector.broadcast %arg1 : vector<8x2xf8E8M0FNU> to vector<4x8x2xf8E8M0FNU>
-// CHECK-NEXT:    [[IN_CAST:%.+]] = vector.shape_cast %arg0 : vector<8x8xf8E5M2> to vector<8x2x4xf8E5M2>
-// CHECK-NEXT:    [[SCALE_CAST:%.+]] = vector.shape_cast [[BCAST]] : vector<4x8x2xf8E8M0FNU> to vector<8x2x4xf8E8M0FNU>
-// CHECK-NEXT:    [[SCALE_EXT:%.+]] = arith.extf [[SCALE_CAST]] : vector<8x2x4xf8E8M0FNU> to vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_0:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [0, 0, 0], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_0:%.+]] = vector.shape_cast [[IN_SLICE_0]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_0:%.+]] = vector.extract [[SCALE_EXT]][0, 0, 0] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_0:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_0]][0], [[SCALE_SCALAR_0]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_0:%.+]] = vector.extract_strided_slice [[PACKED_0]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_0:%.+]] = vector.shape_cast [[OUT_SLICE_0]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_0]], [[CST]] {offsets = [0, 0, 0], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_1:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [0, 0, 1], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_1:%.+]] = vector.shape_cast [[IN_SLICE_1]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_1:%.+]] = vector.extract [[SCALE_EXT]][0, 0, 1] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_1:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_1]][0], [[SCALE_SCALAR_1]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_1:%.+]] = vector.extract_strided_slice [[PACKED_1]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_1:%.+]] = vector.shape_cast [[OUT_SLICE_1]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_1]], [[ACC_A]] {offsets = [0, 0, 1], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_2:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [0, 0, 2], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_2:%.+]] = vector.shape_cast [[IN_SLICE_2]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_2:%.+]] = vector.extract [[SCALE_EXT]][0, 0, 2] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_2:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_2]][0], [[SCALE_SCALAR_2]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_2:%.+]] = vector.extract_strided_slice [[PACKED_2]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_2:%.+]] = vector.shape_cast [[OUT_SLICE_2]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_2]], [[ACC_B]] {offsets = [0, 0, 2], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_3:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [0, 0, 3], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_3:%.+]] = vector.shape_cast [[IN_SLICE_3]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_3:%.+]] = vector.extract [[SCALE_EXT]][0, 0, 3] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_3:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_3]][0], [[SCALE_SCALAR_3]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_3:%.+]] = vector.extract_strided_slice [[PACKED_3]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_3:%.+]] = vector.shape_cast [[OUT_SLICE_3]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_3]], [[ACC_A]] {offsets = [0, 0, 3], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_4:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [0, 1, 0], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_4:%.+]] = vector.shape_cast [[IN_SLICE_4]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_4:%.+]] = vector.extract [[SCALE_EXT]][0, 1, 0] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_4:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_4]][0], [[SCALE_SCALAR_4]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_4:%.+]] = vector.extract_strided_slice [[PACKED_4]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_4:%.+]] = vector.shape_cast [[OUT_SLICE_4]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_4]], [[ACC_B]] {offsets = [0, 1, 0], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_5:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [0, 1, 1], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_5:%.+]] = vector.shape_cast [[IN_SLICE_5]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_5:%.+]] = vector.extract [[SCALE_EXT]][0, 1, 1] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_5:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_5]][0], [[SCALE_SCALAR_5]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_5:%.+]] = vector.extract_strided_slice [[PACKED_5]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_5:%.+]] = vector.shape_cast [[OUT_SLICE_5]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_5]], [[ACC_A]] {offsets = [0, 1, 1], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_6:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [0, 1, 2], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_6:%.+]] = vector.shape_cast [[IN_SLICE_6]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_6:%.+]] = vector.extract [[SCALE_EXT]][0, 1, 2] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_6:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_6]][0], [[SCALE_SCALAR_6]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_6:%.+]] = vector.extract_strided_slice [[PACKED_6]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_6:%.+]] = vector.shape_cast [[OUT_SLICE_6]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_6]], [[ACC_B]] {offsets = [0, 1, 2], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_7:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [0, 1, 3], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_7:%.+]] = vector.shape_cast [[IN_SLICE_7]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_7:%.+]] = vector.extract [[SCALE_EXT]][0, 1, 3] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_7:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_7]][0], [[SCALE_SCALAR_7]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_7:%.+]] = vector.extract_strided_slice [[PACKED_7]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_7:%.+]] = vector.shape_cast [[OUT_SLICE_7]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_7]], [[ACC_A]] {offsets = [0, 1, 3], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_8:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [1, 0, 0], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_8:%.+]] = vector.shape_cast [[IN_SLICE_8]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_8:%.+]] = vector.extract [[SCALE_EXT]][1, 0, 0] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_8:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_8]][0], [[SCALE_SCALAR_8]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_8:%.+]] = vector.extract_strided_slice [[PACKED_8]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_8:%.+]] = vector.shape_cast [[OUT_SLICE_8]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_8]], [[ACC_B]] {offsets = [1, 0, 0], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_9:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [1, 0, 1], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_9:%.+]] = vector.shape_cast [[IN_SLICE_9]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_9:%.+]] = vector.extract [[SCALE_EXT]][1, 0, 1] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_9:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_9]][0], [[SCALE_SCALAR_9]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_9:%.+]] = vector.extract_strided_slice [[PACKED_9]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_9:%.+]] = vector.shape_cast [[OUT_SLICE_9]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_9]], [[ACC_A]] {offsets = [1, 0, 1], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_10:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [1, 0, 2], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_10:%.+]] = vector.shape_cast [[IN_SLICE_10]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_10:%.+]] = vector.extract [[SCALE_EXT]][1, 0, 2] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_10:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_10]][0], [[SCALE_SCALAR_10]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_10:%.+]] = vector.extract_strided_slice [[PACKED_10]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_10:%.+]] = vector.shape_cast [[OUT_SLICE_10]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_10]], [[ACC_B]] {offsets = [1, 0, 2], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_11:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [1, 0, 3], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_11:%.+]] = vector.shape_cast [[IN_SLICE_11]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_11:%.+]] = vector.extract [[SCALE_EXT]][1, 0, 3] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_11:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_11]][0], [[SCALE_SCALAR_11]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_11:%.+]] = vector.extract_strided_slice [[PACKED_11]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_11:%.+]] = vector.shape_cast [[OUT_SLICE_11]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_11]], [[ACC_A]] {offsets = [1, 0, 3], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_12:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [1, 1, 0], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_12:%.+]] = vector.shape_cast [[IN_SLICE_12]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_12:%.+]] = vector.extract [[SCALE_EXT]][1, 1, 0] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_12:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_12]][0], [[SCALE_SCALAR_12]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_12:%.+]] = vector.extract_strided_slice [[PACKED_12]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_12:%.+]] = vector.shape_cast [[OUT_SLICE_12]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_12]], [[ACC_B]] {offsets = [1, 1, 0], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_13:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [1, 1, 1], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_13:%.+]] = vector.shape_cast [[IN_SLICE_13]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_13:%.+]] = vector.extract [[SCALE_EXT]][1, 1, 1] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_13:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_13]][0], [[SCALE_SCALAR_13]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_13:%.+]] = vector.extract_strided_slice [[PACKED_13]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_13:%.+]] = vector.shape_cast [[OUT_SLICE_13]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_13]], [[ACC_A]] {offsets = [1, 1, 1], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_14:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [1, 1, 2], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_14:%.+]] = vector.shape_cast [[IN_SLICE_14]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_14:%.+]] = vector.extract [[SCALE_EXT]][1, 1, 2] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_14:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_14]][0], [[SCALE_SCALAR_14]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_14:%.+]] = vector.extract_strided_slice [[PACKED_14]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_14:%.+]] = vector.shape_cast [[OUT_SLICE_14]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_14]], [[ACC_B]] {offsets = [1, 1, 2], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_15:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [1, 1, 3], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_15:%.+]] = vector.shape_cast [[IN_SLICE_15]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_15:%.+]] = vector.extract [[SCALE_EXT]][1, 1, 3] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_15:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_15]][0], [[SCALE_SCALAR_15]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_15:%.+]] = vector.extract_strided_slice [[PACKED_15]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_15:%.+]] = vector.shape_cast [[OUT_SLICE_15]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_15]], [[ACC_A]] {offsets = [1, 1, 3], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_16:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [2, 0, 0], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_16:%.+]] = vector.shape_cast [[IN_SLICE_16]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_16:%.+]] = vector.extract [[SCALE_EXT]][2, 0, 0] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_16:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_16]][0], [[SCALE_SCALAR_16]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_16:%.+]] = vector.extract_strided_slice [[PACKED_16]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_16:%.+]] = vector.shape_cast [[OUT_SLICE_16]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_16]], [[ACC_B]] {offsets = [2, 0, 0], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_17:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [2, 0, 1], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_17:%.+]] = vector.shape_cast [[IN_SLICE_17]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_17:%.+]] = vector.extract [[SCALE_EXT]][2, 0, 1] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_17:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_17]][0], [[SCALE_SCALAR_17]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_17:%.+]] = vector.extract_strided_slice [[PACKED_17]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_17:%.+]] = vector.shape_cast [[OUT_SLICE_17]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_17]], [[ACC_A]] {offsets = [2, 0, 1], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_18:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [2, 0, 2], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_18:%.+]] = vector.shape_cast [[IN_SLICE_18]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_18:%.+]] = vector.extract [[SCALE_EXT]][2, 0, 2] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_18:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_18]][0], [[SCALE_SCALAR_18]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_18:%.+]] = vector.extract_strided_slice [[PACKED_18]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_18:%.+]] = vector.shape_cast [[OUT_SLICE_18]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_18]], [[ACC_B]] {offsets = [2, 0, 2], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_19:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [2, 0, 3], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_19:%.+]] = vector.shape_cast [[IN_SLICE_19]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_19:%.+]] = vector.extract [[SCALE_EXT]][2, 0, 3] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_19:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_19]][0], [[SCALE_SCALAR_19]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_19:%.+]] = vector.extract_strided_slice [[PACKED_19]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_19:%.+]] = vector.shape_cast [[OUT_SLICE_19]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_19]], [[ACC_A]] {offsets = [2, 0, 3], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_20:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [2, 1, 0], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_20:%.+]] = vector.shape_cast [[IN_SLICE_20]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_20:%.+]] = vector.extract [[SCALE_EXT]][2, 1, 0] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_20:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_20]][0], [[SCALE_SCALAR_20]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_20:%.+]] = vector.extract_strided_slice [[PACKED_20]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_20:%.+]] = vector.shape_cast [[OUT_SLICE_20]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_20]], [[ACC_B]] {offsets = [2, 1, 0], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_21:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [2, 1, 1], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_21:%.+]] = vector.shape_cast [[IN_SLICE_21]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_21:%.+]] = vector.extract [[SCALE_EXT]][2, 1, 1] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_21:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_21]][0], [[SCALE_SCALAR_21]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_21:%.+]] = vector.extract_strided_slice [[PACKED_21]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_21:%.+]] = vector.shape_cast [[OUT_SLICE_21]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_21]], [[ACC_A]] {offsets = [2, 1, 1], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_22:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [2, 1, 2], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_22:%.+]] = vector.shape_cast [[IN_SLICE_22]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_22:%.+]] = vector.extract [[SCALE_EXT]][2, 1, 2] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_22:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_22]][0], [[SCALE_SCALAR_22]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_22:%.+]] = vector.extract_strided_slice [[PACKED_22]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_22:%.+]] = vector.shape_cast [[OUT_SLICE_22]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_22]], [[ACC_B]] {offsets = [2, 1, 2], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_23:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [2, 1, 3], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_23:%.+]] = vector.shape_cast [[IN_SLICE_23]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_23:%.+]] = vector.extract [[SCALE_EXT]][2, 1, 3] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_23:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_23]][0], [[SCALE_SCALAR_23]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_23:%.+]] = vector.extract_strided_slice [[PACKED_23]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_23:%.+]] = vector.shape_cast [[OUT_SLICE_23]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_23]], [[ACC_A]] {offsets = [2, 1, 3], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_24:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [3, 0, 0], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_24:%.+]] = vector.shape_cast [[IN_SLICE_24]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_24:%.+]] = vector.extract [[SCALE_EXT]][3, 0, 0] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_24:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_24]][0], [[SCALE_SCALAR_24]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_24:%.+]] = vector.extract_strided_slice [[PACKED_24]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_24:%.+]] = vector.shape_cast [[OUT_SLICE_24]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_24]], [[ACC_B]] {offsets = [3, 0, 0], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_25:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [3, 0, 1], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_25:%.+]] = vector.shape_cast [[IN_SLICE_25]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_25:%.+]] = vector.extract [[SCALE_EXT]][3, 0, 1] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_25:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_25]][0], [[SCALE_SCALAR_25]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_25:%.+]] = vector.extract_strided_slice [[PACKED_25]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_25:%.+]] = vector.shape_cast [[OUT_SLICE_25]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_25]], [[ACC_A]] {offsets = [3, 0, 1], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_26:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [3, 0, 2], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_26:%.+]] = vector.shape_cast [[IN_SLICE_26]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_26:%.+]] = vector.extract [[SCALE_EXT]][3, 0, 2] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_26:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_26]][0], [[SCALE_SCALAR_26]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_26:%.+]] = vector.extract_strided_slice [[PACKED_26]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_26:%.+]] = vector.shape_cast [[OUT_SLICE_26]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_26]], [[ACC_B]] {offsets = [3, 0, 2], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_27:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [3, 0, 3], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_27:%.+]] = vector.shape_cast [[IN_SLICE_27]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_27:%.+]] = vector.extract [[SCALE_EXT]][3, 0, 3] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_27:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_27]][0], [[SCALE_SCALAR_27]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_27:%.+]] = vector.extract_strided_slice [[PACKED_27]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_27:%.+]] = vector.shape_cast [[OUT_SLICE_27]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_27]], [[ACC_A]] {offsets = [3, 0, 3], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_28:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [3, 1, 0], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_28:%.+]] = vector.shape_cast [[IN_SLICE_28]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_28:%.+]] = vector.extract [[SCALE_EXT]][3, 1, 0] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_28:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_28]][0], [[SCALE_SCALAR_28]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_28:%.+]] = vector.extract_strided_slice [[PACKED_28]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_28:%.+]] = vector.shape_cast [[OUT_SLICE_28]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_28]], [[ACC_B]] {offsets = [3, 1, 0], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_29:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [3, 1, 1], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_29:%.+]] = vector.shape_cast [[IN_SLICE_29]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_29:%.+]] = vector.extract [[SCALE_EXT]][3, 1, 1] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_29:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_29]][0], [[SCALE_SCALAR_29]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_29:%.+]] = vector.extract_strided_slice [[PACKED_29]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_29:%.+]] = vector.shape_cast [[OUT_SLICE_29]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_29]], [[ACC_A]] {offsets = [3, 1, 1], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_30:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [3, 1, 2], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_30:%.+]] = vector.shape_cast [[IN_SLICE_30]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_30:%.+]] = vector.extract [[SCALE_EXT]][3, 1, 2] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_30:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_30]][0], [[SCALE_SCALAR_30]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_30:%.+]] = vector.extract_strided_slice [[PACKED_30]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_30:%.+]] = vector.shape_cast [[OUT_SLICE_30]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_30]], [[ACC_B]] {offsets = [3, 1, 2], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_31:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [3, 1, 3], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_31:%.+]] = vector.shape_cast [[IN_SLICE_31]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_31:%.+]] = vector.extract [[SCALE_EXT]][3, 1, 3] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_31:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_31]][0], [[SCALE_SCALAR_31]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_31:%.+]] = vector.extract_strided_slice [[PACKED_31]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_31:%.+]] = vector.shape_cast [[OUT_SLICE_31]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_31]], [[ACC_A]] {offsets = [3, 1, 3], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_32:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [4, 0, 0], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_32:%.+]] = vector.shape_cast [[IN_SLICE_32]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_32:%.+]] = vector.extract [[SCALE_EXT]][4, 0, 0] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_32:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_32]][0], [[SCALE_SCALAR_32]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_32:%.+]] = vector.extract_strided_slice [[PACKED_32]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_32:%.+]] = vector.shape_cast [[OUT_SLICE_32]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_32]], [[ACC_B]] {offsets = [4, 0, 0], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_33:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [4, 0, 1], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_33:%.+]] = vector.shape_cast [[IN_SLICE_33]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_33:%.+]] = vector.extract [[SCALE_EXT]][4, 0, 1] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_33:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_33]][0], [[SCALE_SCALAR_33]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_33:%.+]] = vector.extract_strided_slice [[PACKED_33]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_33:%.+]] = vector.shape_cast [[OUT_SLICE_33]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_33]], [[ACC_A]] {offsets = [4, 0, 1], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_34:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [4, 0, 2], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_34:%.+]] = vector.shape_cast [[IN_SLICE_34]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_34:%.+]] = vector.extract [[SCALE_EXT]][4, 0, 2] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_34:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_34]][0], [[SCALE_SCALAR_34]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_34:%.+]] = vector.extract_strided_slice [[PACKED_34]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_34:%.+]] = vector.shape_cast [[OUT_SLICE_34]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_34]], [[ACC_B]] {offsets = [4, 0, 2], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_35:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [4, 0, 3], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_35:%.+]] = vector.shape_cast [[IN_SLICE_35]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_35:%.+]] = vector.extract [[SCALE_EXT]][4, 0, 3] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_35:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_35]][0], [[SCALE_SCALAR_35]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_35:%.+]] = vector.extract_strided_slice [[PACKED_35]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_35:%.+]] = vector.shape_cast [[OUT_SLICE_35]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_35]], [[ACC_A]] {offsets = [4, 0, 3], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_36:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [4, 1, 0], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_36:%.+]] = vector.shape_cast [[IN_SLICE_36]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_36:%.+]] = vector.extract [[SCALE_EXT]][4, 1, 0] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_36:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_36]][0], [[SCALE_SCALAR_36]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_36:%.+]] = vector.extract_strided_slice [[PACKED_36]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_36:%.+]] = vector.shape_cast [[OUT_SLICE_36]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_36]], [[ACC_B]] {offsets = [4, 1, 0], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_37:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [4, 1, 1], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_37:%.+]] = vector.shape_cast [[IN_SLICE_37]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_37:%.+]] = vector.extract [[SCALE_EXT]][4, 1, 1] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_37:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_37]][0], [[SCALE_SCALAR_37]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_37:%.+]] = vector.extract_strided_slice [[PACKED_37]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_37:%.+]] = vector.shape_cast [[OUT_SLICE_37]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_37]], [[ACC_A]] {offsets = [4, 1, 1], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_38:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [4, 1, 2], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_38:%.+]] = vector.shape_cast [[IN_SLICE_38]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_38:%.+]] = vector.extract [[SCALE_EXT]][4, 1, 2] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_38:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_38]][0], [[SCALE_SCALAR_38]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_38:%.+]] = vector.extract_strided_slice [[PACKED_38]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_38:%.+]] = vector.shape_cast [[OUT_SLICE_38]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_38]], [[ACC_B]] {offsets = [4, 1, 2], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_39:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [4, 1, 3], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_39:%.+]] = vector.shape_cast [[IN_SLICE_39]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_39:%.+]] = vector.extract [[SCALE_EXT]][4, 1, 3] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_39:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_39]][0], [[SCALE_SCALAR_39]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_39:%.+]] = vector.extract_strided_slice [[PACKED_39]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_39:%.+]] = vector.shape_cast [[OUT_SLICE_39]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_39]], [[ACC_A]] {offsets = [4, 1, 3], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_40:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [5, 0, 0], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_40:%.+]] = vector.shape_cast [[IN_SLICE_40]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_40:%.+]] = vector.extract [[SCALE_EXT]][5, 0, 0] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_40:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_40]][0], [[SCALE_SCALAR_40]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_40:%.+]] = vector.extract_strided_slice [[PACKED_40]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_40:%.+]] = vector.shape_cast [[OUT_SLICE_40]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_40]], [[ACC_B]] {offsets = [5, 0, 0], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_41:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [5, 0, 1], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_41:%.+]] = vector.shape_cast [[IN_SLICE_41]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_41:%.+]] = vector.extract [[SCALE_EXT]][5, 0, 1] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_41:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_41]][0], [[SCALE_SCALAR_41]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_41:%.+]] = vector.extract_strided_slice [[PACKED_41]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_41:%.+]] = vector.shape_cast [[OUT_SLICE_41]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_41]], [[ACC_A]] {offsets = [5, 0, 1], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_42:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [5, 0, 2], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_42:%.+]] = vector.shape_cast [[IN_SLICE_42]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_42:%.+]] = vector.extract [[SCALE_EXT]][5, 0, 2] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_42:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_42]][0], [[SCALE_SCALAR_42]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_42:%.+]] = vector.extract_strided_slice [[PACKED_42]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_42:%.+]] = vector.shape_cast [[OUT_SLICE_42]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_42]], [[ACC_B]] {offsets = [5, 0, 2], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_43:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [5, 0, 3], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_43:%.+]] = vector.shape_cast [[IN_SLICE_43]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_43:%.+]] = vector.extract [[SCALE_EXT]][5, 0, 3] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_43:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_43]][0], [[SCALE_SCALAR_43]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_43:%.+]] = vector.extract_strided_slice [[PACKED_43]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_43:%.+]] = vector.shape_cast [[OUT_SLICE_43]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_43]], [[ACC_A]] {offsets = [5, 0, 3], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_44:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [5, 1, 0], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_44:%.+]] = vector.shape_cast [[IN_SLICE_44]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_44:%.+]] = vector.extract [[SCALE_EXT]][5, 1, 0] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_44:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_44]][0], [[SCALE_SCALAR_44]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_44:%.+]] = vector.extract_strided_slice [[PACKED_44]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_44:%.+]] = vector.shape_cast [[OUT_SLICE_44]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_44]], [[ACC_B]] {offsets = [5, 1, 0], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_45:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [5, 1, 1], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_45:%.+]] = vector.shape_cast [[IN_SLICE_45]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_45:%.+]] = vector.extract [[SCALE_EXT]][5, 1, 1] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_45:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_45]][0], [[SCALE_SCALAR_45]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_45:%.+]] = vector.extract_strided_slice [[PACKED_45]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_45:%.+]] = vector.shape_cast [[OUT_SLICE_45]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_45]], [[ACC_A]] {offsets = [5, 1, 1], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_46:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [5, 1, 2], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_46:%.+]] = vector.shape_cast [[IN_SLICE_46]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_46:%.+]] = vector.extract [[SCALE_EXT]][5, 1, 2] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_46:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_46]][0], [[SCALE_SCALAR_46]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_46:%.+]] = vector.extract_strided_slice [[PACKED_46]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_46:%.+]] = vector.shape_cast [[OUT_SLICE_46]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_46]], [[ACC_B]] {offsets = [5, 1, 2], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_47:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [5, 1, 3], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_47:%.+]] = vector.shape_cast [[IN_SLICE_47]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_47:%.+]] = vector.extract [[SCALE_EXT]][5, 1, 3] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_47:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_47]][0], [[SCALE_SCALAR_47]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_47:%.+]] = vector.extract_strided_slice [[PACKED_47]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_47:%.+]] = vector.shape_cast [[OUT_SLICE_47]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_47]], [[ACC_A]] {offsets = [5, 1, 3], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_48:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [6, 0, 0], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_48:%.+]] = vector.shape_cast [[IN_SLICE_48]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_48:%.+]] = vector.extract [[SCALE_EXT]][6, 0, 0] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_48:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_48]][0], [[SCALE_SCALAR_48]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_48:%.+]] = vector.extract_strided_slice [[PACKED_48]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_48:%.+]] = vector.shape_cast [[OUT_SLICE_48]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_48]], [[ACC_B]] {offsets = [6, 0, 0], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_49:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [6, 0, 1], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_49:%.+]] = vector.shape_cast [[IN_SLICE_49]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_49:%.+]] = vector.extract [[SCALE_EXT]][6, 0, 1] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_49:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_49]][0], [[SCALE_SCALAR_49]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_49:%.+]] = vector.extract_strided_slice [[PACKED_49]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_49:%.+]] = vector.shape_cast [[OUT_SLICE_49]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_49]], [[ACC_A]] {offsets = [6, 0, 1], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_50:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [6, 0, 2], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_50:%.+]] = vector.shape_cast [[IN_SLICE_50]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_50:%.+]] = vector.extract [[SCALE_EXT]][6, 0, 2] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_50:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_50]][0], [[SCALE_SCALAR_50]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_50:%.+]] = vector.extract_strided_slice [[PACKED_50]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_50:%.+]] = vector.shape_cast [[OUT_SLICE_50]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_50]], [[ACC_B]] {offsets = [6, 0, 2], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_51:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [6, 0, 3], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_51:%.+]] = vector.shape_cast [[IN_SLICE_51]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_51:%.+]] = vector.extract [[SCALE_EXT]][6, 0, 3] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_51:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_51]][0], [[SCALE_SCALAR_51]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_51:%.+]] = vector.extract_strided_slice [[PACKED_51]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_51:%.+]] = vector.shape_cast [[OUT_SLICE_51]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_51]], [[ACC_A]] {offsets = [6, 0, 3], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_52:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [6, 1, 0], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_52:%.+]] = vector.shape_cast [[IN_SLICE_52]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_52:%.+]] = vector.extract [[SCALE_EXT]][6, 1, 0] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_52:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_52]][0], [[SCALE_SCALAR_52]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_52:%.+]] = vector.extract_strided_slice [[PACKED_52]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_52:%.+]] = vector.shape_cast [[OUT_SLICE_52]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_52]], [[ACC_B]] {offsets = [6, 1, 0], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_53:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [6, 1, 1], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_53:%.+]] = vector.shape_cast [[IN_SLICE_53]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_53:%.+]] = vector.extract [[SCALE_EXT]][6, 1, 1] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_53:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_53]][0], [[SCALE_SCALAR_53]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_53:%.+]] = vector.extract_strided_slice [[PACKED_53]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_53:%.+]] = vector.shape_cast [[OUT_SLICE_53]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_53]], [[ACC_A]] {offsets = [6, 1, 1], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_54:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [6, 1, 2], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_54:%.+]] = vector.shape_cast [[IN_SLICE_54]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_54:%.+]] = vector.extract [[SCALE_EXT]][6, 1, 2] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_54:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_54]][0], [[SCALE_SCALAR_54]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_54:%.+]] = vector.extract_strided_slice [[PACKED_54]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_54:%.+]] = vector.shape_cast [[OUT_SLICE_54]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_54]], [[ACC_B]] {offsets = [6, 1, 2], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_55:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [6, 1, 3], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_55:%.+]] = vector.shape_cast [[IN_SLICE_55]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_55:%.+]] = vector.extract [[SCALE_EXT]][6, 1, 3] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_55:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_55]][0], [[SCALE_SCALAR_55]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_55:%.+]] = vector.extract_strided_slice [[PACKED_55]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_55:%.+]] = vector.shape_cast [[OUT_SLICE_55]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_55]], [[ACC_A]] {offsets = [6, 1, 3], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_56:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [7, 0, 0], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_56:%.+]] = vector.shape_cast [[IN_SLICE_56]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_56:%.+]] = vector.extract [[SCALE_EXT]][7, 0, 0] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_56:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_56]][0], [[SCALE_SCALAR_56]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_56:%.+]] = vector.extract_strided_slice [[PACKED_56]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_56:%.+]] = vector.shape_cast [[OUT_SLICE_56]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_56]], [[ACC_B]] {offsets = [7, 0, 0], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_57:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [7, 0, 1], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_57:%.+]] = vector.shape_cast [[IN_SLICE_57]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_57:%.+]] = vector.extract [[SCALE_EXT]][7, 0, 1] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_57:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_57]][0], [[SCALE_SCALAR_57]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_57:%.+]] = vector.extract_strided_slice [[PACKED_57]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_57:%.+]] = vector.shape_cast [[OUT_SLICE_57]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_57]], [[ACC_A]] {offsets = [7, 0, 1], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_58:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [7, 0, 2], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_58:%.+]] = vector.shape_cast [[IN_SLICE_58]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_58:%.+]] = vector.extract [[SCALE_EXT]][7, 0, 2] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_58:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_58]][0], [[SCALE_SCALAR_58]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_58:%.+]] = vector.extract_strided_slice [[PACKED_58]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_58:%.+]] = vector.shape_cast [[OUT_SLICE_58]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_58]], [[ACC_B]] {offsets = [7, 0, 2], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_59:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [7, 0, 3], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_59:%.+]] = vector.shape_cast [[IN_SLICE_59]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_59:%.+]] = vector.extract [[SCALE_EXT]][7, 0, 3] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_59:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_59]][0], [[SCALE_SCALAR_59]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_59:%.+]] = vector.extract_strided_slice [[PACKED_59]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_59:%.+]] = vector.shape_cast [[OUT_SLICE_59]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_59]], [[ACC_A]] {offsets = [7, 0, 3], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_60:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [7, 1, 0], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_60:%.+]] = vector.shape_cast [[IN_SLICE_60]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_60:%.+]] = vector.extract [[SCALE_EXT]][7, 1, 0] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_60:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_60]][0], [[SCALE_SCALAR_60]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_60:%.+]] = vector.extract_strided_slice [[PACKED_60]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_60:%.+]] = vector.shape_cast [[OUT_SLICE_60]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_60]], [[ACC_B]] {offsets = [7, 1, 0], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_61:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [7, 1, 1], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_61:%.+]] = vector.shape_cast [[IN_SLICE_61]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_61:%.+]] = vector.extract [[SCALE_EXT]][7, 1, 1] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_61:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_61]][0], [[SCALE_SCALAR_61]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_61:%.+]] = vector.extract_strided_slice [[PACKED_61]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_61:%.+]] = vector.shape_cast [[OUT_SLICE_61]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_61]], [[ACC_A]] {offsets = [7, 1, 1], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_62:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [7, 1, 2], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_62:%.+]] = vector.shape_cast [[IN_SLICE_62]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_62:%.+]] = vector.extract [[SCALE_EXT]][7, 1, 2] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_62:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_62]][0], [[SCALE_SCALAR_62]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_62:%.+]] = vector.extract_strided_slice [[PACKED_62]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_62:%.+]] = vector.shape_cast [[OUT_SLICE_62]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_VEC_62]], [[ACC_B]] {offsets = [7, 1, 2], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[IN_SLICE_63:%.+]] = vector.extract_strided_slice [[IN_CAST]] {offsets = [7, 1, 3], sizes = [1, 1, 1], strides = [1, 1, 1]} : vector<8x2x4xf8E5M2> to vector<1x1x1xf8E5M2>
-// CHECK-NEXT:    [[IN_VEC_63:%.+]] = vector.shape_cast [[IN_SLICE_63]] : vector<1x1x1xf8E5M2> to vector<1xf8E5M2>
-// CHECK-NEXT:    [[SCALE_SCALAR_63:%.+]] = vector.extract [[SCALE_EXT]][7, 1, 3] : f32 from vector<8x2x4xf32>
-// CHECK-NEXT:    [[PACKED_63:%.+]] = amdgpu.scaled_ext_packed [[IN_VEC_63]][0], [[SCALE_SCALAR_63]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[OUT_SLICE_63:%.+]] = vector.extract_strided_slice [[PACKED_63]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
-// CHECK-NEXT:    [[OUT_VEC_63:%.+]] = vector.shape_cast [[OUT_SLICE_63]] : vector<1xf32> to vector<1x1x1xf32>
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_VEC_63]], [[ACC_A]] {offsets = [7, 1, 3], strides = [1, 1, 1]} : vector<1x1x1xf32> into vector<8x2x4xf32>
-// CHECK-NEXT:    [[FINAL_CAST:%.+]] = vector.shape_cast [[ACC_B]] : vector<8x2x4xf32> to vector<8x8xf32>
-// CHECK-NEXT:    return [[FINAL_CAST]] : vector<8x8xf32>
+// CHECK-DAG:     %[[CST:.+]] = arith.constant dense<0.000000e+00> : vector<8x2x4xf32>
+// CHECK-DAG:     %[[BCAST:.+]] = vector.broadcast %arg1
+// CHECK-DAG:     %[[IN_CAST:.+]] = vector.shape_cast %arg0
+// CHECK-DAG:     %[[SCALE_CAST:.+]] = vector.shape_cast %[[BCAST]]
+// CHECK-DAG:     %[[SCALE_EXT:.+]] = arith.extf %[[SCALE_CAST]]
+// CHECK-DAG:     vector.extract_strided_slice %[[IN_CAST]] {offsets = [0, 0, 0]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract %[[SCALE_EXT]][0, 0, 0]
+// CHECK-NEXT:    amdgpu.scaled_ext_packed
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.insert_strided_slice
+// CHECK-DAG:     vector.extract_strided_slice %[[IN_CAST]] {offsets = [0, 0, 1]
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.extract %[[SCALE_EXT]][0, 0, 1]
+// CHECK-NEXT:    amdgpu.scaled_ext_packed
+// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.shape_cast
+// CHECK-NEXT:    vector.insert_strided_slice
+// CHECK-DAG:     vector.shape_cast
 func.func @conversion_broadcast(%in: vector<8x8xf8E5M2>, %scale: vector<8x2xf8E8M0FNU>) -> vector<8x8xf32> {
     %bc = vector.broadcast %scale : vector<8x2xf8E8M0FNU> to vector<4x8x2xf8E8M0FNU>
     %cast1 = vector.shape_cast %in : vector<8x8xf8E5M2> to vector<8x2x4xf8E5M2>
@@ -624,11 +189,11 @@ func.func @conversion_broadcast(%in: vector<8x8xf8E5M2>, %scale: vector<8x2xf8E8
 // -----
 
 // CHECK-LABEL: @conversion_scalar
-// CHECK:         [[SCALE_F32:%.+]] = arith.extf %arg1 : f8E8M0FNU to f32
-// CHECK-NEXT:    [[SPLAT_IN:%.+]] = vector.splat %arg0 : vector<1xf8E5M2>
-// CHECK-NEXT:    [[PACKED_EXT:%.+]] = amdgpu.scaled_ext_packed [[SPLAT_IN]][0], [[SCALE_F32]] : vector<1xf8E5M2> to vector<2xf32>
-// CHECK-NEXT:    [[RESULT:%.+]] = vector.extract [[PACKED_EXT]][0] : f32 from vector<2xf32>
-// CHECK-NEXT:    return [[RESULT]] : f32
+// CHECK:         %[[SCALE_F32:.+]] = arith.extf %arg1 : f8E8M0FNU to f32
+// CHECK-NEXT:    %[[SPLAT_IN:.+]] = vector.splat %arg0 : vector<1xf8E5M2>
+// CHECK-NEXT:    %[[PACKED_EXT:.+]] = amdgpu.scaled_ext_packed %[[SPLAT_IN]][0], %[[SCALE_F32]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    %[[RESULT:.+]] = vector.extract %[[PACKED_EXT]][0] : f32 from vector<2xf32>
+// CHECK-NEXT:    return %[[RESULT]] : f32
 func.func @conversion_scalar(%in: f8E5M2, %scale: f8E8M0FNU) -> f32 {
     %ext = arith.scaling_extf %in, %scale : f8E5M2, f8E8M0FNU to f32
     return %ext : f32
diff --git a/mlir/test/Conversion/ArithToAMDGPU/scaling-truncf.mlir b/mlir/test/Conversion/ArithToAMDGPU/scaling-truncf.mlir
index 2dcb0c554cc6f..0fc94c5733246 100644
--- a/mlir/test/Conversion/ArithToAMDGPU/scaling-truncf.mlir
+++ b/mlir/test/Conversion/ArithToAMDGPU/scaling-truncf.mlir
@@ -1,37 +1,37 @@
 // RUN: mlir-opt --split-input-file %s -convert-arith-to-amdgpu="chipset=gfx950" | FileCheck %s
 
 // CHECK-LABEL: @conversion_f8_fallback
-// CHECK-DAG:     [[CST:%.+]] = arith.constant dense<0.000000e+00> : vector<2x2xf8E5M2>
-// CHECK-DAG:     [[SCALE_EXT:%.+]] = arith.extf %arg1 : vector<2x2xf8E8M0FNU> to vector<2x2xf32>
-// CHECK:         [[IN_SLICE_00:%.+]] = vector.extract_strided_slice %arg0 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]}
-// CHECK-NEXT:    [[IN_SCALAR_00:%.+]] = vector.shape_cast [[IN_SLICE_00]]
-// CHECK-NEXT:    [[SCALE_SCALAR_00:%.+]] = vector.extract [[SCALE_EXT]][0, 0]
-// CHECK-NEXT:    [[PACKED_00:%.+]] = amdgpu.packed_scaled_trunc [[IN_SCALAR_00]] into undef[0], [[SCALE_SCALAR_00]]
-// CHECK-NEXT:    [[OUT_SLICE_00:%.+]] = vector.extract_strided_slice [[PACKED_00]]
-// CHECK-NEXT:    [[OUT_SCALAR_00:%.+]] = vector.shape_cast [[OUT_SLICE_00]]
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_SCALAR_00]], [[CST]]
-// CHECK-NEXT:    [[IN_SLICE_01:%.+]] = vector.extract_strided_slice %arg0 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]}
-// CHECK-NEXT:    [[IN_SCALAR_01:%.+]] = vector.shape_cast [[IN_SLICE_01]]
-// CHECK-NEXT:    [[SCALE_SCALAR_01:%.+]] = vector.extract [[SCALE_EXT]][0, 1]
-// CHECK-NEXT:    [[PACKED_01:%.+]] = amdgpu.packed_scaled_trunc [[IN_SCALAR_01]] into undef[0], [[SCALE_SCALAR_01]]
-// CHECK-NEXT:    [[OUT_SLICE_01:%.+]] = vector.extract_strided_slice [[PACKED_01]]
-// CHECK-NEXT:    [[OUT_SCALAR_01:%.+]] = vector.shape_cast [[OUT_SLICE_01]]
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_SCALAR_01]], [[ACC_A]]
-// CHECK-NEXT:    [[IN_SLICE_10:%.+]] = vector.extract_strided_slice %arg0 {offsets = [1, 0], sizes = [1, 1], strides = [1, 1]}
-// CHECK-NEXT:    [[IN_SCALAR_10:%.+]] = vector.shape_cast [[IN_SLICE_10]]
-// CHECK-NEXT:    [[SCALE_SCALAR_10:%.+]] = vector.extract [[SCALE_EXT]][1, 0]
-// CHECK-NEXT:    [[PACKED_10:%.+]] = amdgpu.packed_scaled_trunc [[IN_SCALAR_10]] into undef[0], [[SCALE_SCALAR_10]]
-// CHECK-NEXT:    [[OUT_SLICE_10:%.+]] = vector.extract_strided_slice [[PACKED_10]]
-// CHECK-NEXT:    [[OUT_SCALAR_10:%.+]] = vector.shape_cast [[OUT_SLICE_10]]
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_SCALAR_10]], [[ACC_B]]
-// CHECK-NEXT:    [[IN_SLICE_11:%.+]] = vector.extract_strided_slice %arg0 {offsets = [1, 1], sizes = [1, 1], strides = [1, 1]}
-// CHECK-NEXT:    [[IN_SCALAR_11:%.+]] = vector.shape_cast [[IN_SLICE_11]]
-// CHECK-NEXT:    [[SCALE_SCALAR_11:%.+]] = vector.extract [[SCALE_EXT]][1, 1]
-// CHECK-NEXT:    [[PACKED_11:%.+]] = amdgpu.packed_scaled_trunc [[IN_SCALAR_11]] into undef[0], [[SCALE_SCALAR_11]]
-// CHECK-NEXT:    [[OUT_SLICE_11:%.+]] = vector.extract_strided_slice [[PACKED_11]]
-// CHECK-NEXT:    [[OUT_SCALAR_11:%.+]] = vector.shape_cast [[OUT_SLICE_11]]
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_SCALAR_11]], [[ACC_A]]
-// CHECK-NEXT:    return [[ACC_B]] : vector<2x2xf8E5M2>
+// CHECK-DAG:     %[[CST:.+]] = arith.constant dense<0.000000e+00> : vector<2x2xf8E5M2>
+// CHECK-DAG:     %[[SCALE_EXT:.+]] = arith.extf %arg1 : vector<2x2xf8E8M0FNU> to vector<2x2xf32>
+// CHECK:         %[[IN_SLICE_00:.+]] = vector.extract_strided_slice %arg0 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]}
+// CHECK-NEXT:    %[[IN_SCALAR_00:.+]] = vector.shape_cast %[[IN_SLICE_00]]
+// CHECK-NEXT:    %[[SCALE_SCALAR_00:.+]] = vector.extract %[[SCALE_EXT]][0, 0]
+// CHECK-NEXT:    %[[PACKED_00:.+]] = amdgpu.packed_scaled_trunc %[[IN_SCALAR_00]] into undef[0], %[[SCALE_SCALAR_00]]
+// CHECK-NEXT:    %[[OUT_SLICE_00:.+]] = vector.extract_strided_slice %[[PACKED_00]]
+// CHECK-NEXT:    %[[OUT_SCALAR_00:.+]] = vector.shape_cast %[[OUT_SLICE_00]]
+// CHECK-NEXT:    %[[ACC_A:.+]] = vector.insert_strided_slice %[[OUT_SCALAR_00]], %[[CST]]
+// CHECK-NEXT:    %[[IN_SLICE_01:.+]] = vector.extract_strided_slice %arg0 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]}
+// CHECK-NEXT:    %[[IN_SCALAR_01:.+]] = vector.shape_cast %[[IN_SLICE_01]]
+// CHECK-NEXT:    %[[SCALE_SCALAR_01:.+]] = vector.extract %[[SCALE_EXT]][0, 1]
+// CHECK-NEXT:    %[[PACKED_01:.+]] = amdgpu.packed_scaled_trunc %[[IN_SCALAR_01]] into undef[0], %[[SCALE_SCALAR_01]]
+// CHECK-NEXT:    %[[OUT_SLICE_01:.+]] = vector.extract_strided_slice %[[PACKED_01]]
+// CHECK-NEXT:    %[[OUT_SCALAR_01:.+]] = vector.shape_cast %[[OUT_SLICE_01]]
+// CHECK-NEXT:    %[[ACC_B:.+]] = vector.insert_strided_slice %[[OUT_SCALAR_01]], %[[ACC_A]]
+// CHECK-NEXT:    %[[IN_SLICE_10:.+]] = vector.extract_strided_slice %arg0 {offsets = [1, 0], sizes = [1, 1], strides = [1, 1]}
+// CHECK-NEXT:    %[[IN_SCALAR_10:.+]] = vector.shape_cast %[[IN_SLICE_10]]
+// CHECK-NEXT:    %[[SCALE_SCALAR_10:.+]] = vector.extract %[[SCALE_EXT]][1, 0]
+// CHECK-NEXT:    %[[PACKED_10:.+]] = amdgpu.packed_scaled_trunc %[[IN_SCALAR_10]] into undef[0], %[[SCALE_SCALAR_10]]
+// CHECK-NEXT:    %[[OUT_SLICE_10:.+]] = vector.extract_strided_slice %[[PACKED_10]]
+// CHECK-NEXT:    %[[OUT_SCALAR_10:.+]] = vector.shape_cast %[[OUT_SLICE_10]]
+// CHECK-NEXT:    %[[ACC_A:.+]] = vector.insert_strided_slice %[[OUT_SCALAR_10]], %[[ACC_B]]
+// CHECK-NEXT:    %[[IN_SLICE_11:.+]] = vector.extract_strided_slice %arg0 {offsets = [1, 1], sizes = [1, 1], strides = [1, 1]}
+// CHECK-NEXT:    %[[IN_SCALAR_11:.+]] = vector.shape_cast %[[IN_SLICE_11]]
+// CHECK-NEXT:    %[[SCALE_SCALAR_11:.+]] = vector.extract %[[SCALE_EXT]][1, 1]
+// CHECK-NEXT:    %[[PACKED_11:.+]] = amdgpu.packed_scaled_trunc %[[IN_SCALAR_11]] into undef[0], %[[SCALE_SCALAR_11]]
+// CHECK-NEXT:    %[[OUT_SLICE_11:.+]] = vector.extract_strided_slice %[[PACKED_11]]
+// CHECK-NEXT:    %[[OUT_SCALAR_11:.+]] = vector.shape_cast %[[OUT_SLICE_11]]
+// CHECK-NEXT:    %[[ACC_B:.+]] = vector.insert_strided_slice %[[OUT_SCALAR_11]], %[[ACC_A]]
+// CHECK-NEXT:    return %[[ACC_B]] : vector<2x2xf8E5M2>
 func.func @conversion_f8_fallback(%in: vector<2x2xf32>, %scale: vector<2x2xf8E8M0FNU>) -> vector<2x2xf8E5M2> {
     %ext = arith.scaling_truncf %in, %scale : vector<2x2xf32>, vector<2x2xf8E8M0FNU> to vector<2x2xf8E5M2>
     return %ext : vector<2x2xf8E5M2>
@@ -40,37 +40,37 @@ func.func @conversion_f8_fallback(%in: vector<2x2xf32>, %scale: vector<2x2xf8E8M
 // -----
 
 // CHECK-LABEL: @conversion_f4_fallback
-// CHECK-DAG:     [[CST:%.+]] = arith.constant dense<0.000000e+00> : vector<2x2xf4E2M1FN>
-// CHECK-DAG:     [[SCALE_EXT:%.+]] = arith.extf %arg1 : vector<2x2xf8E8M0FNU> to vector<2x2xf32>
-// CHECK:         [[IN_SLICE_00:%.+]] = vector.extract_strided_slice %arg0 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]}
-// CHECK-NEXT:    [[IN_SCALAR_00:%.+]] = vector.shape_cast [[IN_SLICE_00]]
-// CHECK-NEXT:    [[SCALE_SCALAR_00:%.+]] = vector.extract [[SCALE_EXT]][0, 0]
-// CHECK-NEXT:    [[PACKED_00:%.+]] = amdgpu.packed_scaled_trunc [[IN_SCALAR_00]] into undef[0], [[SCALE_SCALAR_00]]
-// CHECK-NEXT:    [[OUT_SLICE_00:%.+]] = vector.extract_strided_slice [[PACKED_00]]
-// CHECK-NEXT:    [[OUT_SCALAR_00:%.+]] = vector.shape_cast [[OUT_SLICE_00]]
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_SCALAR_00]], [[CST]]
-// CHECK-NEXT:    [[IN_SLICE_01:%.+]] = vector.extract_strided_slice %arg0 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]}
-// CHECK-NEXT:    [[IN_SCALAR_01:%.+]] = vector.shape_cast [[IN_SLICE_01]]
-// CHECK-NEXT:    [[SCALE_SCALAR_01:%.+]] = vector.extract [[SCALE_EXT]][0, 1]
-// CHECK-NEXT:    [[PACKED_01:%.+]] = amdgpu.packed_scaled_trunc [[IN_SCALAR_01]] into undef[0], [[SCALE_SCALAR_01]]
-// CHECK-NEXT:    [[OUT_SLICE_01:%.+]] = vector.extract_strided_slice [[PACKED_01]]
-// CHECK-NEXT:    [[OUT_SCALAR_01:%.+]] = vector.shape_cast [[OUT_SLICE_01]]
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_SCALAR_01]], [[ACC_A]]
-// CHECK-NEXT:    [[IN_SLICE_10:%.+]] = vector.extract_strided_slice %arg0 {offsets = [1, 0], sizes = [1, 1], strides = [1, 1]}
-// CHECK-NEXT:    [[IN_SCALAR_10:%.+]] = vector.shape_cast [[IN_SLICE_10]]
-// CHECK-NEXT:    [[SCALE_SCALAR_10:%.+]] = vector.extract [[SCALE_EXT]][1, 0]
-// CHECK-NEXT:    [[PACKED_10:%.+]] = amdgpu.packed_scaled_trunc [[IN_SCALAR_10]] into undef[0], [[SCALE_SCALAR_10]]
-// CHECK-NEXT:    [[OUT_SLICE_10:%.+]] = vector.extract_strided_slice [[PACKED_10]]
-// CHECK-NEXT:    [[OUT_SCALAR_10:%.+]] = vector.shape_cast [[OUT_SLICE_10]]
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice [[OUT_SCALAR_10]], [[ACC_B]]
-// CHECK-NEXT:    [[IN_SLICE_11:%.+]] = vector.extract_strided_slice %arg0 {offsets = [1, 1], sizes = [1, 1], strides = [1, 1]}
-// CHECK-NEXT:    [[IN_SCALAR_11:%.+]] = vector.shape_cast [[IN_SLICE_11]]
-// CHECK-NEXT:    [[SCALE_SCALAR_11:%.+]] = vector.extract [[SCALE_EXT]][1, 1]
-// CHECK-NEXT:    [[PACKED_11:%.+]] = amdgpu.packed_scaled_trunc [[IN_SCALAR_11]] into undef[0], [[SCALE_SCALAR_11]]
-// CHECK-NEXT:    [[OUT_SLICE_11:%.+]] = vector.extract_strided_slice [[PACKED_11]]
-// CHECK-NEXT:    [[OUT_SCALAR_11:%.+]] = vector.shape_cast [[OUT_SLICE_11]]
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice [[OUT_SCALAR_11]], [[ACC_A]]
-// CHECK-NEXT:    return [[ACC_B]] : vector<2x2xf4E2M1FN>
+// CHECK-DAG:     %[[CST:.+]] = arith.constant dense<0.000000e+00> : vector<2x2xf4E2M1FN>
+// CHECK-DAG:     %[[SCALE_EXT:.+]] = arith.extf %arg1 : vector<2x2xf8E8M0FNU> to vector<2x2xf32>
+// CHECK:         %[[IN_SLICE_00:.+]] = vector.extract_strided_slice %arg0 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]}
+// CHECK-NEXT:    %[[IN_SCALAR_00:.+]] = vector.shape_cast %[[IN_SLICE_00]]
+// CHECK-NEXT:    %[[SCALE_SCALAR_00:.+]] = vector.extract %[[SCALE_EXT]][0, 0]
+// CHECK-NEXT:    %[[PACKED_00:.+]] = amdgpu.packed_scaled_trunc %[[IN_SCALAR_00]] into undef[0], %[[SCALE_SCALAR_00]]
+// CHECK-NEXT:    %[[OUT_SLICE_00:.+]] = vector.extract_strided_slice %[[PACKED_00]]
+// CHECK-NEXT:    %[[OUT_SCALAR_00:.+]] = vector.shape_cast %[[OUT_SLICE_00]]
+// CHECK-NEXT:    %[[ACC_A:.+]] = vector.insert_strided_slice %[[OUT_SCALAR_00]], %[[CST]]
+// CHECK-NEXT:    %[[IN_SLICE_01:.+]] = vector.extract_strided_slice %arg0 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]}
+// CHECK-NEXT:    %[[IN_SCALAR_01:.+]] = vector.shape_cast %[[IN_SLICE_01]]
+// CHECK-NEXT:    %[[SCALE_SCALAR_01:.+]] = vector.extract %[[SCALE_EXT]][0, 1]
+// CHECK-NEXT:    %[[PACKED_01:.+]] = amdgpu.packed_scaled_trunc %[[IN_SCALAR_01]] into undef[0], %[[SCALE_SCALAR_01]]
+// CHECK-NEXT:    %[[OUT_SLICE_01:.+]] = vector.extract_strided_slice %[[PACKED_01]]
+// CHECK-NEXT:    %[[OUT_SCALAR_01:.+]] = vector.shape_cast %[[OUT_SLICE_01]]
+// CHECK-NEXT:    %[[ACC_B:.+]] = vector.insert_strided_slice %[[OUT_SCALAR_01]], %[[ACC_A]]
+// CHECK-NEXT:    %[[IN_SLICE_10:.+]] = vector.extract_strided_slice %arg0 {offsets = [1, 0], sizes = [1, 1], strides = [1, 1]}
+// CHECK-NEXT:    %[[IN_SCALAR_10:.+]] = vector.shape_cast %[[IN_SLICE_10]]
+// CHECK-NEXT:    %[[SCALE_SCALAR_10:.+]] = vector.extract %[[SCALE_EXT]][1, 0]
+// CHECK-NEXT:    %[[PACKED_10:.+]] = amdgpu.packed_scaled_trunc %[[IN_SCALAR_10]] into undef[0], %[[SCALE_SCALAR_10]]
+// CHECK-NEXT:    %[[OUT_SLICE_10:.+]] = vector.extract_strided_slice %[[PACKED_10]]
+// CHECK-NEXT:    %[[OUT_SCALAR_10:.+]] = vector.shape_cast %[[OUT_SLICE_10]]
+// CHECK-NEXT:    %[[ACC_A:.+]] = vector.insert_strided_slice %[[OUT_SCALAR_10]], %[[ACC_B]]
+// CHECK-NEXT:    %[[IN_SLICE_11:.+]] = vector.extract_strided_slice %arg0 {offsets = [1, 1], sizes = [1, 1], strides = [1, 1]}
+// CHECK-NEXT:    %[[IN_SCALAR_11:.+]] = vector.shape_cast %[[IN_SLICE_11]]
+// CHECK-NEXT:    %[[SCALE_SCALAR_11:.+]] = vector.extract %[[SCALE_EXT]][1, 1]
+// CHECK-NEXT:    %[[PACKED_11:.+]] = amdgpu.packed_scaled_trunc %[[IN_SCALAR_11]] into undef[0], %[[SCALE_SCALAR_11]]
+// CHECK-NEXT:    %[[OUT_SLICE_11:.+]] = vector.extract_strided_slice %[[PACKED_11]]
+// CHECK-NEXT:    %[[OUT_SCALAR_11:.+]] = vector.shape_cast %[[OUT_SLICE_11]]
+// CHECK-NEXT:    %[[ACC_B:.+]] = vector.insert_strided_slice %[[OUT_SCALAR_11]], %[[ACC_A]]
+// CHECK-NEXT:    return %[[ACC_B]] : vector<2x2xf4E2M1FN>
 func.func @conversion_f4_fallback(%in: vector<2x2xf32>, %scale: vector<2x2xf8E8M0FNU>) -> vector<2x2xf4E2M1FN> {
     %ext = arith.scaling_truncf %in, %scale : vector<2x2xf32>, vector<2x2xf8E8M0FNU> to vector<2x2xf4E2M1FN>
     return %ext : vector<2x2xf4E2M1FN>
@@ -79,461 +79,25 @@ func.func @conversion_f4_fallback(%in: vector<2x2xf32>, %scale: vector<2x2xf8E8M
 // -----
 
 // CHECK-LABEL: @conversion_broadcast
-// CHECK-DAG:     [[CST:%.+]] = arith.constant dense<0.000000e+00> : vector<8x2x4xf8E5M2>
-// CHECK-DAG:     [[BCAST:%.+]] = vector.broadcast %arg1
-// CHECK-DAG:     [[IN_CAST:%.+]] = vector.shape_cast %arg0 : vector<8x8xf32> to vector<8x2x4xf32>
-// CHECK-DAG:     [[SCALE_CAST:%.+]] = vector.shape_cast [[BCAST]]
-// CHECK-DAG:     [[SCALE_EXT:%.+]] = arith.extf [[SCALE_CAST]] : vector<8x2x4xf8E8M0FNU> to vector<8x2x4xf32>
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [0, 0, 0]
+// CHECK-DAG:     %[[CST:.+]] = arith.constant dense<0.000000e+00> : vector<8x2x4xf8E5M2>
+// CHECK-DAG:     %[[BCAST:.+]] = vector.broadcast %arg1
+// CHECK-DAG:     %[[IN_CAST:.+]] = vector.shape_cast %arg0 : vector<8x8xf32> to vector<8x2x4xf32>
+// CHECK-DAG:     %[[SCALE_CAST:.+]] = vector.shape_cast %[[BCAST]]
+// CHECK-DAG:     %[[SCALE_EXT:.+]] = arith.extf %[[SCALE_CAST]] : vector<8x2x4xf8E8M0FNU> to vector<8x2x4xf32>
+// CHECK-DAG:     vector.extract_strided_slice %[[IN_CAST]] {offsets = [0, 0, 0]
 // CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][0, 0, 0]
+// CHECK-NEXT:    vector.extract %[[SCALE_EXT]][0, 0, 0]
 // CHECK-NEXT:    amdgpu.packed_scaled_trunc
 // CHECK-NEXT:    vector.extract_strided_slice
 // CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[CST]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [0, 0, 1]
+// CHECK-NEXT:    vector.insert_strided_slice %{{.+}}, %[[CST]]
+// CHECK-DAG:     vector.extract_strided_slice %[[IN_CAST]] {offsets = [0, 0, 1]
 // CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][0, 0, 1]
+// CHECK-NEXT:    vector.extract %[[SCALE_EXT]][0, 0, 1]
 // CHECK-NEXT:    amdgpu.packed_scaled_trunc
 // CHECK-NEXT:    vector.extract_strided_slice
 // CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [0, 0, 2]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][0, 0, 2]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [0, 0, 3]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][0, 0, 3]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [0, 1, 0]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][0, 1, 0]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [0, 1, 1]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][0, 1, 1]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [0, 1, 2]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][0, 1, 2]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [0, 1, 3]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][0, 1, 3]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [1, 0, 0]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][1, 0, 0]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [1, 0, 1]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][1, 0, 1]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [1, 0, 2]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][1, 0, 2]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [1, 0, 3]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][1, 0, 3]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [1, 1, 0]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][1, 1, 0]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [1, 1, 1]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][1, 1, 1]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [1, 1, 2]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][1, 1, 2]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [1, 1, 3]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][1, 1, 3]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [2, 0, 0]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][2, 0, 0]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [2, 0, 1]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][2, 0, 1]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [2, 0, 2]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][2, 0, 2]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [2, 0, 3]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][2, 0, 3]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [2, 1, 0]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][2, 1, 0]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [2, 1, 1]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][2, 1, 1]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [2, 1, 2]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][2, 1, 2]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [2, 1, 3]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][2, 1, 3]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [3, 0, 0]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][3, 0, 0]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [3, 0, 1]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][3, 0, 1]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [3, 0, 2]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][3, 0, 2]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [3, 0, 3]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][3, 0, 3]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [3, 1, 0]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][3, 1, 0]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [3, 1, 1]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][3, 1, 1]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [3, 1, 2]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][3, 1, 2]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [3, 1, 3]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][3, 1, 3]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [4, 0, 0]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][4, 0, 0]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [4, 0, 1]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][4, 0, 1]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [4, 0, 2]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][4, 0, 2]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [4, 0, 3]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][4, 0, 3]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [4, 1, 0]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][4, 1, 0]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [4, 1, 1]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][4, 1, 1]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [4, 1, 2]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][4, 1, 2]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [4, 1, 3]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][4, 1, 3]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [5, 0, 0]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][5, 0, 0]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [5, 0, 1]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][5, 0, 1]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [5, 0, 2]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][5, 0, 2]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [5, 0, 3]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][5, 0, 3]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [5, 1, 0]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][5, 1, 0]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [5, 1, 1]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][5, 1, 1]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [5, 1, 2]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][5, 1, 2]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [5, 1, 3]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][5, 1, 3]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [6, 0, 0]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][6, 0, 0]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [6, 0, 1]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][6, 0, 1]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [6, 0, 2]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][6, 0, 2]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [6, 0, 3]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][6, 0, 3]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [6, 1, 0]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][6, 1, 0]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [6, 1, 1]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][6, 1, 1]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [6, 1, 2]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][6, 1, 2]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [6, 1, 3]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][6, 1, 3]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [7, 0, 0]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][7, 0, 0]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [7, 0, 1]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][7, 0, 1]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [7, 0, 2]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][7, 0, 2]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [7, 0, 3]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][7, 0, 3]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [7, 1, 0]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][7, 1, 0]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [7, 1, 1]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][7, 1, 1]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [7, 1, 2]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][7, 1, 2]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_A:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_B]]
-// CHECK-NEXT:    vector.extract_strided_slice [[IN_CAST]] {offsets = [7, 1, 3]
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract [[SCALE_EXT]][7, 1, 3]
-// CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
-// CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    [[ACC_B:%.+]] = vector.insert_strided_slice %{{.*}}, [[ACC_A]]
-// CHECK-NEXT:    [[FINAL_CAST:%.+]] = vector.shape_cast [[ACC_B]]
-// CHECK-NEXT:    return [[FINAL_CAST]] : vector<8x8xf8E5M2>
+// CHECK-NEXT:    vector.insert_strided_slice
 func.func @conversion_broadcast(%in: vector<8x8xf32>, %scale: vector<8x2xf8E8M0FNU>) -> vector<8x8xf8E5M2> {
     %bc = vector.broadcast %scale : vector<8x2xf8E8M0FNU> to vector<4x8x2xf8E8M0FNU>
     %cast1 = vector.shape_cast %in : vector<8x8xf32> to vector<8x2x4xf32>
@@ -546,11 +110,11 @@ func.func @conversion_broadcast(%in: vector<8x8xf32>, %scale: vector<8x2xf8E8M0F
 // -----
 
 // CHECK-LABEL: @conversion_scalar
-// CHECK:         [[SCALE_F32:%.+]] = arith.extf %arg1 : f8E8M0FNU to f32
-// CHECK-NEXT:    [[SPLAT_IN:%.+]] = vector.splat %arg0 : vector<1xf32>
-// CHECK-NEXT:    [[PACKED_TRUNC:%.+]] = amdgpu.packed_scaled_trunc [[SPLAT_IN]] into undef[0], [[SCALE_F32]]
-// CHECK-NEXT:    [[RESULT:%.+]] = vector.extract [[PACKED_TRUNC]][0]
-// CHECK-NEXT:    return [[RESULT]] : f8E5M2
+// CHECK:         %[[SCALE_F32:.+]] = arith.extf %arg1 : f8E8M0FNU to f32
+// CHECK-NEXT:    %[[SPLAT_IN:.+]] = vector.splat %arg0 : vector<1xf32>
+// CHECK-NEXT:    %[[PACKED_TRUNC:.+]] = amdgpu.packed_scaled_trunc %[[SPLAT_IN]] into undef[0], %[[SCALE_F32]]
+// CHECK-NEXT:    %[[RESULT:.+]] = vector.extract %[[PACKED_TRUNC]][0]
+// CHECK-NEXT:    return %[[RESULT]] : f8E5M2
 func.func @conversion_scalar(%in: f32, %scale: f8E8M0FNU) -> f8E5M2 {
     %ext = arith.scaling_truncf %in, %scale : f32, f8E8M0FNU to f8E5M2
     return %ext : f8E5M2

>From a3dcac3a64a993b22896b15f0b15859b66a4418c Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Tue, 8 Jul 2025 12:04:31 +0000
Subject: [PATCH 10/17] cleanup originalScaleShape

---
 .../lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
index 0955b0c3e7752..d87c76a8bff18 100644
--- a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
+++ b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
@@ -488,19 +488,17 @@ ScalingExtFRewritePattern::matchAndRewrite(arith::ScalingExtFOp op,
   }
 
   VectorType inVecType = cast<VectorType>(in.getType());
-  Value origScale = getOriginalVectorValue(scale);
+  Value origScale = getOriginalVectorValue(op.getScale());
 
-  int64_t scalarShape[1] = {1};
   ArrayRef<int64_t> inShape = inVecType.getShape();
-  ArrayRef<int64_t> originalScaleShape = {scalarShape};
+  SmallVector<int64_t> originalScaleShape;
   if (auto origScaleVecType = dyn_cast<VectorType>(origScale.getType()))
-    originalScaleShape = origScaleVecType.getShape();
+    llvm::append_range(originalScaleShape, origScaleVecType.getShape());
 
-  SmallVector<int64_t> paddedScaleShape(originalScaleShape);
-  paddedScaleShape.insert(paddedScaleShape.end(),
-                          inShape.size() - originalScaleShape.size(), 1);
+  originalScaleShape.insert(originalScaleShape.end(),
+                            inShape.size() - originalScaleShape.size(), 1);
 
-  auto maybeRatio = computeShapeRatio(inShape, paddedScaleShape);
+  auto maybeRatio = computeShapeRatio(inShape, originalScaleShape);
   assert(maybeRatio &&
          "failed to derive block size from broadcast or splat operation");
 

>From fa5987105a8c148e3e60aa16e3113710c2a4d50f Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Tue, 8 Jul 2025 12:04:58 +0000
Subject: [PATCH 11/17] fix tests

---
 .../ArithToAMDGPU/scaling-extf.mlir           | 23 ++++++++++++-------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/mlir/test/Conversion/ArithToAMDGPU/scaling-extf.mlir b/mlir/test/Conversion/ArithToAMDGPU/scaling-extf.mlir
index 40c7e3ef6b267..7751701b265f1 100644
--- a/mlir/test/Conversion/ArithToAMDGPU/scaling-extf.mlir
+++ b/mlir/test/Conversion/ArithToAMDGPU/scaling-extf.mlir
@@ -162,21 +162,28 @@ func.func @conversion_f4_f16_fallback(%in: vector<2x2xf4E2M1FN>, %scale: vector<
 // CHECK-DAG:     %[[IN_CAST:.+]] = vector.shape_cast %arg0
 // CHECK-DAG:     %[[SCALE_CAST:.+]] = vector.shape_cast %[[BCAST]]
 // CHECK-DAG:     %[[SCALE_EXT:.+]] = arith.extf %[[SCALE_CAST]]
-// CHECK-DAG:     vector.extract_strided_slice %[[IN_CAST]] {offsets = [0, 0, 0]
+// CHECK-DAG:     vector.extract_strided_slice %[[IN_CAST]] {offsets = [0, 0, 0], sizes = [1, 1, 4], strides = [1, 1, 1]}
 // CHECK-NEXT:    vector.shape_cast
 // CHECK-NEXT:    vector.extract %[[SCALE_EXT]][0, 0, 0]
+// CHECK-NEXT:    vector.extract_strided_slice %{{.+}} {offsets = [0], sizes = [2], strides = [1]}
 // CHECK-NEXT:    amdgpu.scaled_ext_packed
-// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.insert_strided_slice %{{.+}}, %{{.+}} {offsets = [0], strides = [1]}
+// CHECK-NEXT:    vector.extract_strided_slice %{{.+}} {offsets = [2], sizes = [2], strides = [1]}
+// CHECK-NEXT:    amdgpu.scaled_ext_packed
+// CHECK-NEXT:    vector.insert_strided_slice %{{.+}}, %{{.+}} {offsets = [2], strides = [1]}
 // CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.insert_strided_slice
-// CHECK-DAG:     vector.extract_strided_slice %[[IN_CAST]] {offsets = [0, 0, 1]
+// CHECK-NEXT:    vector.insert_strided_slice %{{.+}} {offsets = [0, 0, 0], strides = [1, 1, 1]}
+// CHECK-NEXT:    vector.extract_strided_slice %[[IN_CAST]] {offsets = [0, 1, 0], sizes = [1, 1, 4], strides = [1, 1, 1]}
 // CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract %[[SCALE_EXT]][0, 0, 1]
+// CHECK-NEXT:    vector.extract %[[SCALE_EXT]][0, 1, 0]
+// CHECK-NEXT:    vector.extract_strided_slice %{{.+}} {offsets = [0], sizes = [2], strides = [1]}
+// CHECK-NEXT:    amdgpu.scaled_ext_packed
+// CHECK-NEXT:    vector.insert_strided_slice %{{.+}}, %{{.+}} {offsets = [0], strides = [1]}
+// CHECK-NEXT:    vector.extract_strided_slice %{{.+}} {offsets = [2], sizes = [2], strides = [1]}
 // CHECK-NEXT:    amdgpu.scaled_ext_packed
-// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.insert_strided_slice %{{.+}}, %{{.+}} {offsets = [2], strides = [1]}
 // CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.insert_strided_slice
-// CHECK-DAG:     vector.shape_cast
+// CHECK-NEXT:    vector.insert_strided_slice %{{.+}}, %{{.+}} {offsets = [0, 1, 0], strides = [1, 1, 1]}
 func.func @conversion_broadcast(%in: vector<8x8xf8E5M2>, %scale: vector<8x2xf8E8M0FNU>) -> vector<8x8xf32> {
     %bc = vector.broadcast %scale : vector<8x2xf8E8M0FNU> to vector<4x8x2xf8E8M0FNU>
     %cast1 = vector.shape_cast %in : vector<8x8xf8E5M2> to vector<8x2x4xf8E5M2>

>From 34e6dd9ba15723b2459f5f22a49bafe6bf574fd1 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Tue, 8 Jul 2025 12:40:00 +0000
Subject: [PATCH 12/17] cleanup originalScaleShape

---
 .../Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp  | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
index d87c76a8bff18..66c0c90ee792b 100644
--- a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
+++ b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
@@ -597,19 +597,17 @@ ScalingTruncFRewritePattern::matchAndRewrite(arith::ScalingTruncFOp op,
   }
 
   VectorType inVecType = cast<VectorType>(in.getType());
-  Value origScale = getOriginalVectorValue(scale);
+  Value origScale = getOriginalVectorValue(op.getScale());
 
-  int64_t scalarShape[1] = {1};
   ArrayRef<int64_t> inShape = inVecType.getShape();
-  ArrayRef<int64_t> originalScaleShape = {scalarShape};
+  SmallVector<int64_t> originalScaleShape;
   if (auto origScaleVecType = dyn_cast<VectorType>(origScale.getType()))
-    originalScaleShape = origScaleVecType.getShape();
+    llvm::append_range(originalScaleShape, origScaleVecType.getShape());
 
-  SmallVector<int64_t> paddedScaleShape(originalScaleShape);
-  paddedScaleShape.insert(paddedScaleShape.end(),
-                          inShape.size() - originalScaleShape.size(), 1);
+  originalScaleShape.insert(originalScaleShape.end(),
+                            inShape.size() - originalScaleShape.size(), 1);
 
-  auto maybeRatio = computeShapeRatio(inShape, paddedScaleShape);
+  auto maybeRatio = computeShapeRatio(inShape, originalScaleShape);
   assert(maybeRatio &&
          "failed to derive block size from broadcast or splat operation");
 
@@ -643,7 +641,8 @@ ScalingTruncFRewritePattern::matchAndRewrite(arith::ScalingTruncFOp op,
       Value scaleTrunc = rewriter.create<amdgpu::PackedScaledTruncOp>(
           loc, truncScaleResultType, slice, uniformScale, 0,
           /*existing=*/nullptr);
-      if (sliceWidth != opWidth)
+      int64_t packedWidth = cast<VectorType>(scaleTrunc.getType()).getNumElements();
+      if (packedWidth != opWidth)
         scaleTrunc = rewriter.create<vector::ExtractStridedSliceOp>(
             loc, scaleTrunc, 0, sliceWidth, 1);
       blockResult = rewriter.create<vector::InsertStridedSliceOp>(

>From e36a8b238d7ad28721e1bc4de9d5f23dfcb8ccf0 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Tue, 8 Jul 2025 12:40:19 +0000
Subject: [PATCH 13/17] fix tests

---
 .../ArithToAMDGPU/scaling-truncf.mlir         | 28 +++++++++++++------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/mlir/test/Conversion/ArithToAMDGPU/scaling-truncf.mlir b/mlir/test/Conversion/ArithToAMDGPU/scaling-truncf.mlir
index 0fc94c5733246..129e8c5a6fa54 100644
--- a/mlir/test/Conversion/ArithToAMDGPU/scaling-truncf.mlir
+++ b/mlir/test/Conversion/ArithToAMDGPU/scaling-truncf.mlir
@@ -81,23 +81,35 @@ func.func @conversion_f4_fallback(%in: vector<2x2xf32>, %scale: vector<2x2xf8E8M
 // CHECK-LABEL: @conversion_broadcast
 // CHECK-DAG:     %[[CST:.+]] = arith.constant dense<0.000000e+00> : vector<8x2x4xf8E5M2>
 // CHECK-DAG:     %[[BCAST:.+]] = vector.broadcast %arg1
-// CHECK-DAG:     %[[IN_CAST:.+]] = vector.shape_cast %arg0 : vector<8x8xf32> to vector<8x2x4xf32>
+// CHECK-DAG:     %[[IN_CAST:.+]] = vector.shape_cast %arg0
 // CHECK-DAG:     %[[SCALE_CAST:.+]] = vector.shape_cast %[[BCAST]]
-// CHECK-DAG:     %[[SCALE_EXT:.+]] = arith.extf %[[SCALE_CAST]] : vector<8x2x4xf8E8M0FNU> to vector<8x2x4xf32>
-// CHECK-DAG:     vector.extract_strided_slice %[[IN_CAST]] {offsets = [0, 0, 0]
+// CHECK-DAG:     %[[SCALE_EXT:.+]] = arith.extf %[[SCALE_CAST]]
+// CHECK-DAG:     vector.extract_strided_slice %[[IN_CAST]] {offsets = [0, 0, 0], sizes = [1, 1, 4], strides = [1, 1, 1]}
 // CHECK-NEXT:    vector.shape_cast
 // CHECK-NEXT:    vector.extract %[[SCALE_EXT]][0, 0, 0]
+// CHECK-NEXT:    vector.extract_strided_slice %{{.+}} {offsets = [0], sizes = [2], strides = [1]}
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice %{{.+}} {offsets = [0], sizes = [2], strides = [1]}
+// CHECK-NEXT:    vector.insert_strided_slice %{{.+}}, %{{.+}} {offsets = [0], strides = [1]}
+// CHECK-NEXT:    vector.extract_strided_slice %{{.+}} {offsets = [2], sizes = [2], strides = [1]}
 // CHECK-NEXT:    amdgpu.packed_scaled_trunc
 // CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.insert_strided_slice %{{.+}}, %{{.+}} {offsets = [2], strides = [1]}
 // CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.insert_strided_slice %{{.+}}, %[[CST]]
-// CHECK-DAG:     vector.extract_strided_slice %[[IN_CAST]] {offsets = [0, 0, 1]
+// CHECK-NEXT:    vector.insert_strided_slice %{{.+}} {offsets = [0, 0, 0], strides = [1, 1, 1]}
+// CHECK-NEXT:    vector.extract_strided_slice %[[IN_CAST]] {offsets = [0, 1, 0], sizes = [1, 1, 4], strides = [1, 1, 1]}
 // CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.extract %[[SCALE_EXT]][0, 0, 1]
+// CHECK-NEXT:    vector.extract %[[SCALE_EXT]][0, 1, 0]
+// CHECK-NEXT:    vector.extract_strided_slice %{{.+}} {offsets = [0], sizes = [2], strides = [1]}
 // CHECK-NEXT:    amdgpu.packed_scaled_trunc
-// CHECK-NEXT:    vector.extract_strided_slice
+// CHECK-NEXT:    vector.extract_strided_slice %{{.+}} {offsets = [0], sizes = [2], strides = [1]}
+// CHECK-NEXT:    vector.insert_strided_slice %{{.+}}, %{{.+}} {offsets = [0], strides = [1]}
+// CHECK-NEXT:    vector.extract_strided_slice %{{.+}} {offsets = [2], sizes = [2], strides = [1]}
+// CHECK-NEXT:    amdgpu.packed_scaled_trunc
+// CHECK-NEXT:    vector.extract_strided_slice %{{.+}} {offsets = [0], sizes = [2], strides = [1]}
+// CHECK-NEXT:    vector.insert_strided_slice %{{.+}}, %{{.+}} {offsets = [2], strides = [1]}
 // CHECK-NEXT:    vector.shape_cast
-// CHECK-NEXT:    vector.insert_strided_slice
+// CHECK-NEXT:    vector.insert_strided_slice %{{.+}}, %{{.+}} {offsets = [0, 1, 0], strides = [1, 1, 1]}
 func.func @conversion_broadcast(%in: vector<8x8xf32>, %scale: vector<8x2xf8E8M0FNU>) -> vector<8x8xf8E5M2> {
     %bc = vector.broadcast %scale : vector<8x2xf8E8M0FNU> to vector<4x8x2xf8E8M0FNU>
     %cast1 = vector.shape_cast %in : vector<8x8xf32> to vector<8x2x4xf32>

>From 9ccf6a408eb04ebf43245412b098275357c7f6b2 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Tue, 8 Jul 2025 12:48:43 +0000
Subject: [PATCH 14/17] fix loop

---
 mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
index 66c0c90ee792b..0601fd42630fe 100644
--- a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
+++ b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
@@ -525,9 +525,9 @@ ScalingExtFRewritePattern::matchAndRewrite(arith::ScalingExtFOp op,
     Value blockResult =
         rewriter.createOrFold<vector::SplatOp>(loc, blockResultType, zero);
 
-    for (int64_t i = 0, sliceWidth = opWidth - blockSize % opWidth;
+    for (int64_t i = 0, sliceWidth = std::min(opWidth, blockSize - i);
          i < blockSize;
-         i += sliceWidth, sliceWidth = opWidth - blockSize % opWidth) {
+         i += sliceWidth, sliceWidth = std::min(opWidth, blockSize - i)) {
       Value slice = rewriter.create<vector::ExtractStridedSliceOp>(
           loc, block1D, i, sliceWidth, 1);
       // TODO: replace this with non-packed ScaledExtOp for sliceWidth == 1
@@ -632,9 +632,9 @@ ScalingTruncFRewritePattern::matchAndRewrite(arith::ScalingTruncFOp op,
     Value blockResult =
         rewriter.createOrFold<vector::SplatOp>(loc, blockResultType, zero);
 
-    for (int64_t i = 0, sliceWidth = opWidth - blockSize % opWidth;
+    for (int64_t i = 0, sliceWidth = std::min(opWidth, blockSize - i);
          i < blockSize;
-         i += sliceWidth, sliceWidth = opWidth - blockSize % opWidth) {
+         i += sliceWidth, sliceWidth = std::min(opWidth, blockSize - i)) {
       Value slice = rewriter.create<vector::ExtractStridedSliceOp>(
           loc, block1D, i, sliceWidth, 1);
       // TODO: replace this with non-packed ScaledTruncOp for sliceWidth == 1

>From aec259cf88169db7d5b4a985300ae1ec6fa11102 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Tue, 8 Jul 2025 13:00:07 +0000
Subject: [PATCH 15/17] update tests

---
 .../ArithToAMDGPU/scaling-extf.mlir           | 36 ++++++++++++++++++
 .../ArithToAMDGPU/scaling-truncf.mlir         | 38 +++++++++++++++++++
 2 files changed, 74 insertions(+)

diff --git a/mlir/test/Conversion/ArithToAMDGPU/scaling-extf.mlir b/mlir/test/Conversion/ArithToAMDGPU/scaling-extf.mlir
index 7751701b265f1..a7eeab0b84d11 100644
--- a/mlir/test/Conversion/ArithToAMDGPU/scaling-extf.mlir
+++ b/mlir/test/Conversion/ArithToAMDGPU/scaling-extf.mlir
@@ -195,6 +195,42 @@ func.func @conversion_broadcast(%in: vector<8x8xf8E5M2>, %scale: vector<8x2xf8E8
 
 // -----
 
+// CHECK-LABEL: @conversion_broadcast_odd
+// CHECK-NEXT:    %[[CST_PARTIAL:.+]] = arith.constant dense<0.000000e+00> : vector<3xf32>
+// CHECK-NEXT:    %[[CST_FINAL:.+]] = arith.constant dense<0.000000e+00> : vector<6xf32>
+// CHECK-NEXT:    %[[SCALE_BC:.+]] = vector.broadcast %arg1 : vector<2xf8E8M0FNU> to vector<3x2xf8E8M0FNU>
+// CHECK-NEXT:    %[[SCALE_FLAT:.+]] = vector.shape_cast %[[SCALE_BC]] : vector<3x2xf8E8M0FNU> to vector<6xf8E8M0FNU>
+// CHECK-NEXT:    %[[SCALE_EXT:.+]] = arith.extf %[[SCALE_FLAT]] : vector<6xf8E8M0FNU> to vector<6xf32>
+// CHECK-NEXT:    %[[IN_SLICE_0:.+]] = vector.extract_strided_slice %arg0 {offsets = [0], sizes = [3], strides = [1]} : vector<6xf8E5M2> to vector<3xf8E5M2>
+// CHECK-NEXT:    %[[SCALE_SCALAR_0:.+]] = vector.extract %[[SCALE_EXT]][0] : f32 from vector<6xf32>
+// CHECK-NEXT:    %[[IN_CHUNK_0A:.+]] = vector.extract_strided_slice %[[IN_SLICE_0]] {offsets = [0], sizes = [2], strides = [1]} : vector<3xf8E5M2> to vector<2xf8E5M2>
+// CHECK-NEXT:    %[[PACKED_0A:.+]] = amdgpu.scaled_ext_packed %[[IN_CHUNK_0A]][0], %[[SCALE_SCALAR_0]] : vector<2xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    %[[PARTIAL_ACC_0:.+]] = vector.insert_strided_slice %[[PACKED_0A]], %[[CST_PARTIAL]] {offsets = [0], strides = [1]} : vector<2xf32> into vector<3xf32>
+// CHECK-NEXT:    %[[IN_CHUNK_0B:.+]] = vector.extract_strided_slice %[[IN_SLICE_0]] {offsets = [2], sizes = [1], strides = [1]} : vector<3xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    %[[PACKED_0B_RAW:.+]] = amdgpu.scaled_ext_packed %[[IN_CHUNK_0B]][0], %[[SCALE_SCALAR_0]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    %[[PACKED_0B:.+]] = vector.extract_strided_slice %[[PACKED_0B_RAW]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    %[[OUT_SLICE_0:.+]] = vector.insert_strided_slice %[[PACKED_0B]], %[[PARTIAL_ACC_0]] {offsets = [2], strides = [1]} : vector<1xf32> into vector<3xf32>
+// CHECK-NEXT:    %[[FINAL_ACC_A:.+]] = vector.insert_strided_slice %[[OUT_SLICE_0]], %[[CST_FINAL]] {offsets = [0], strides = [1]} : vector<3xf32> into vector<6xf32>
+// CHECK-NEXT:    %[[IN_SLICE_1:.+]] = vector.extract_strided_slice %arg0 {offsets = [3], sizes = [3], strides = [1]} : vector<6xf8E5M2> to vector<3xf8E5M2>
+// CHECK-NEXT:    %[[SCALE_SCALAR_1:.+]] = vector.extract %[[SCALE_EXT]][3] : f32 from vector<6xf32>
+// CHECK-NEXT:    %[[IN_CHUNK_1A:.+]] = vector.extract_strided_slice %[[IN_SLICE_1]] {offsets = [0], sizes = [2], strides = [1]} : vector<3xf8E5M2> to vector<2xf8E5M2>
+// CHECK-NEXT:    %[[PACKED_1A:.+]] = amdgpu.scaled_ext_packed %[[IN_CHUNK_1A]][0], %[[SCALE_SCALAR_1]] : vector<2xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    %[[PARTIAL_ACC_1:.+]] = vector.insert_strided_slice %[[PACKED_1A]], %[[CST_PARTIAL]] {offsets = [0], strides = [1]} : vector<2xf32> into vector<3xf32>
+// CHECK-NEXT:    %[[IN_CHUNK_1B:.+]] = vector.extract_strided_slice %[[IN_SLICE_1]] {offsets = [2], sizes = [1], strides = [1]} : vector<3xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    %[[PACKED_1B_RAW:.+]] = amdgpu.scaled_ext_packed %[[IN_CHUNK_1B]][0], %[[SCALE_SCALAR_1]] : vector<1xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    %[[PACKED_1B:.+]] = vector.extract_strided_slice %[[PACKED_1B_RAW]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf32> to vector<1xf32>
+// CHECK-NEXT:    %[[OUT_SLICE_1:.+]] = vector.insert_strided_slice %[[PACKED_1B]], %[[PARTIAL_ACC_1]] {offsets = [2], strides = [1]} : vector<1xf32> into vector<3xf32>
+// CHECK-NEXT:    %[[RESULT:.+]] = vector.insert_strided_slice %[[OUT_SLICE_1]], %[[FINAL_ACC_A]] {offsets = [3], strides = [1]} : vector<3xf32> into vector<6xf32>
+// CHECK-NEXT:    return %[[RESULT]] : vector<6xf32>
+func.func @conversion_broadcast_odd(%in: vector<6xf8E5M2>, %scale: vector<2xf8E8M0FNU>) -> vector<6xf32> {
+    %bc = vector.broadcast %scale : vector<2xf8E8M0FNU> to vector<3x2xf8E8M0FNU>
+    %cast = vector.shape_cast %bc : vector<3x2xf8E8M0FNU> to vector<6xf8E8M0FNU>
+    %ext = arith.scaling_extf %in, %cast : vector<6xf8E5M2>, vector<6xf8E8M0FNU> to vector<6xf32>
+    return %ext : vector<6xf32>
+}
+
+// -----
+
 // CHECK-LABEL: @conversion_scalar
 // CHECK:         %[[SCALE_F32:.+]] = arith.extf %arg1 : f8E8M0FNU to f32
 // CHECK-NEXT:    %[[SPLAT_IN:.+]] = vector.splat %arg0 : vector<1xf8E5M2>
diff --git a/mlir/test/Conversion/ArithToAMDGPU/scaling-truncf.mlir b/mlir/test/Conversion/ArithToAMDGPU/scaling-truncf.mlir
index 129e8c5a6fa54..b60b1f372fd10 100644
--- a/mlir/test/Conversion/ArithToAMDGPU/scaling-truncf.mlir
+++ b/mlir/test/Conversion/ArithToAMDGPU/scaling-truncf.mlir
@@ -121,6 +121,44 @@ func.func @conversion_broadcast(%in: vector<8x8xf32>, %scale: vector<8x2xf8E8M0F
 
 // -----
 
+// CHECK-LABEL: @conversion_broadcast_odd
+// CHECK-NEXT:    %[[CST3:.+]] = arith.constant dense<0.000000e+00> : vector<3xf8E5M2>
+// CHECK-NEXT:    %[[CST6:.+]] = arith.constant dense<0.000000e+00> : vector<6xf8E5M2>
+// CHECK-NEXT:    %[[SCALE_BCAST:.+]] = vector.broadcast %arg1 : vector<2xf8E8M0FNU> to vector<3x2xf8E8M0FNU>
+// CHECK-NEXT:    %[[SCALE_FLAT:.+]] = vector.shape_cast %[[SCALE_BCAST]] : vector<3x2xf8E8M0FNU> to vector<6xf8E8M0FNU>
+// CHECK-NEXT:    %[[SCALE_EXTF:.+]] = arith.extf %[[SCALE_FLAT]] : vector<6xf8E8M0FNU> to vector<6xf32>
+// CHECK-NEXT:    %[[IN_CHUNK0:.+]] = vector.extract_strided_slice %arg0 {offsets = [0], sizes = [3], strides = [1]} : vector<6xf32> to vector<3xf32>
+// CHECK-NEXT:    %[[SCALE0:.+]] = vector.extract %[[SCALE_EXTF]][0] : f32 from vector<6xf32>
+// CHECK-NEXT:    %[[IN_CHUNK0_PART0:.+]] = vector.extract_strided_slice %[[IN_CHUNK0]] {offsets = [0], sizes = [2], strides = [1]} : vector<3xf32> to vector<2xf32>
+// CHECK-NEXT:    %[[PACKED0_PART0:.+]] = amdgpu.packed_scaled_trunc %[[IN_CHUNK0_PART0]] into undef[0], %[[SCALE0]] : vector<2xf32> to vector<4xf8E5M2>
+// CHECK-NEXT:    %[[OUT_CHUNK0_PART0:.+]] = vector.extract_strided_slice %[[PACKED0_PART0]] {offsets = [0], sizes = [2], strides = [1]} : vector<4xf8E5M2> to vector<2xf8E5M2>
+// CHECK-NEXT:    %[[ACCUM0_PART0:.+]] = vector.insert_strided_slice %[[OUT_CHUNK0_PART0]], %[[CST3]] {offsets = [0], strides = [1]} : vector<2xf8E5M2> into vector<3xf8E5M2>
+// CHECK-NEXT:    %[[IN_CHUNK0_PART1:.+]] = vector.extract_strided_slice %[[IN_CHUNK0]] {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
+// CHECK-NEXT:    %[[PACKED0_PART1:.+]] = amdgpu.packed_scaled_trunc %[[IN_CHUNK0_PART1]] into undef[0], %[[SCALE0]] : vector<1xf32> to vector<4xf8E5M2>
+// CHECK-NEXT:    %[[OUT_CHUNK0_PART1:.+]] = vector.extract_strided_slice %[[PACKED0_PART1]] {offsets = [0], sizes = [1], strides = [1]} : vector<4xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    %[[CHUNK0_RES:.+]] = vector.insert_strided_slice %[[OUT_CHUNK0_PART1]], %[[ACCUM0_PART0]] {offsets = [2], strides = [1]} : vector<1xf8E5M2> into vector<3xf8E5M2>
+// CHECK-NEXT:    %[[FINAL_ACCUM_A:.+]] = vector.insert_strided_slice %[[CHUNK0_RES]], %[[CST6]] {offsets = [0], strides = [1]} : vector<3xf8E5M2> into vector<6xf8E5M2>
+// CHECK-NEXT:    %[[IN_CHUNK1:.+]] = vector.extract_strided_slice %arg0 {offsets = [3], sizes = [3], strides = [1]} : vector<6xf32> to vector<3xf32>
+// CHECK-NEXT:    %[[SCALE1:.+]] = vector.extract %[[SCALE_EXTF]][3] : f32 from vector<6xf32>
+// CHECK-NEXT:    %[[IN_CHUNK1_PART0:.+]] = vector.extract_strided_slice %[[IN_CHUNK1]] {offsets = [0], sizes = [2], strides = [1]} : vector<3xf32> to vector<2xf32>
+// CHECK-NEXT:    %[[PACKED1_PART0:.+]] = amdgpu.packed_scaled_trunc %[[IN_CHUNK1_PART0]] into undef[0], %[[SCALE1]] : vector<2xf32> to vector<4xf8E5M2>
+// CHECK-NEXT:    %[[OUT_CHUNK1_PART0:.+]] = vector.extract_strided_slice %[[PACKED1_PART0]] {offsets = [0], sizes = [2], strides = [1]} : vector<4xf8E5M2> to vector<2xf8E5M2>
+// CHECK-NEXT:    %[[ACCUM1_PART0:.+]] = vector.insert_strided_slice %[[OUT_CHUNK1_PART0]], %[[CST3]] {offsets = [0], strides = [1]} : vector<2xf8E5M2> into vector<3xf8E5M2>
+// CHECK-NEXT:    %[[IN_CHUNK1_PART1:.+]] = vector.extract_strided_slice %[[IN_CHUNK1]] {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
+// CHECK-NEXT:    %[[PACKED1_PART1:.+]] = amdgpu.packed_scaled_trunc %[[IN_CHUNK1_PART1]] into undef[0], %[[SCALE1]] : vector<1xf32> to vector<4xf8E5M2>
+// CHECK-NEXT:    %[[OUT_CHUNK1_PART1:.+]] = vector.extract_strided_slice %[[PACKED1_PART1]] {offsets = [0], sizes = [1], strides = [1]} : vector<4xf8E5M2> to vector<1xf8E5M2>
+// CHECK-NEXT:    %[[CHUNK1_RES:.+]] = vector.insert_strided_slice %[[OUT_CHUNK1_PART1]], %[[ACCUM1_PART0]] {offsets = [2], strides = [1]} : vector<1xf8E5M2> into vector<3xf8E5M2>
+// CHECK-NEXT:    %[[FINAL_RESULT:.+]] = vector.insert_strided_slice %[[CHUNK1_RES]], %[[FINAL_ACCUM_A]] {offsets = [3], strides = [1]} : vector<3xf8E5M2> into vector<6xf8E5M2>
+// CHECK-NEXT:    return %[[FINAL_RESULT]] : vector<6xf8E5M2>
+func.func @conversion_broadcast_odd(%in: vector<6xf32>, %scale: vector<2xf8E8M0FNU>) -> vector<6xf8E5M2> {
+    %bc = vector.broadcast %scale : vector<2xf8E8M0FNU> to vector<3x2xf8E8M0FNU>
+    %cast = vector.shape_cast %bc : vector<3x2xf8E8M0FNU> to vector<6xf8E8M0FNU>
+    %ext = arith.scaling_truncf %in, %cast : vector<6xf32>, vector<6xf8E8M0FNU> to vector<6xf8E5M2>
+    return %ext : vector<6xf8E5M2>
+}
+
+// -----
+
 // CHECK-LABEL: @conversion_scalar
 // CHECK:         %[[SCALE_F32:.+]] = arith.extf %arg1 : f8E8M0FNU to f32
 // CHECK-NEXT:    %[[SPLAT_IN:.+]] = vector.splat %arg0 : vector<1xf32>

>From b76a3e8654f436a9cbd072f980ebf9849df20f28 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Tue, 8 Jul 2025 13:10:35 +0000
Subject: [PATCH 16/17] more tests

---
 .../ArithToAMDGPU/scaling-extf.mlir           | 19 ++++++++++++++++
 .../ArithToAMDGPU/scaling-truncf.mlir         | 22 +++++++++++++++++++
 2 files changed, 41 insertions(+)

diff --git a/mlir/test/Conversion/ArithToAMDGPU/scaling-extf.mlir b/mlir/test/Conversion/ArithToAMDGPU/scaling-extf.mlir
index a7eeab0b84d11..095f3e575eca8 100644
--- a/mlir/test/Conversion/ArithToAMDGPU/scaling-extf.mlir
+++ b/mlir/test/Conversion/ArithToAMDGPU/scaling-extf.mlir
@@ -229,6 +229,25 @@ func.func @conversion_broadcast_odd(%in: vector<6xf8E5M2>, %scale: vector<2xf8E8
     return %ext : vector<6xf32>
 }
 
+// -----
+// CHECK-LABEL: @conversion_splat
+// CHECK-DAG:     %[[CST:.+]] = arith.constant dense<0.000000e+00> : vector<4xf32>
+// CHECK-DAG:     %[[SCALE_SPLAT:.+]] = vector.splat %arg1 : vector<4xf8E8M0FNU>
+// CHECK-DAG:     %[[SCALE_EXTF:.+]] = arith.extf %[[SCALE_SPLAT]] : vector<4xf8E8M0FNU> to vector<4xf32>
+// CHECK-DAG:     %[[SCALE_SCALAR:.+]] = vector.extract %[[SCALE_EXTF]][0] : f32 from vector<4xf32>
+// CHECK:         %[[IN_CHUNK0:.+]] = vector.extract_strided_slice %arg0 {offsets = [0], sizes = [2], strides = [1]} : vector<4xf8E5M2> to vector<2xf8E5M2>
+// CHECK-NEXT:    %[[OUT_CHUNK0:.+]] = amdgpu.scaled_ext_packed %[[IN_CHUNK0]][0], %[[SCALE_SCALAR]] : vector<2xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    %[[ACCUM_A:.+]] = vector.insert_strided_slice %[[OUT_CHUNK0]], %[[CST]] {offsets = [0], strides = [1]} : vector<2xf32> into vector<4xf32>
+// CHECK-NEXT:    %[[IN_CHUNK1:.+]] = vector.extract_strided_slice %arg0 {offsets = [2], sizes = [2], strides = [1]} : vector<4xf8E5M2> to vector<2xf8E5M2>
+// CHECK-NEXT:    %[[OUT_CHUNK1:.+]] = amdgpu.scaled_ext_packed %[[IN_CHUNK1]][0], %[[SCALE_SCALAR]] : vector<2xf8E5M2> to vector<2xf32>
+// CHECK-NEXT:    %[[FINAL_RESULT:.+]] = vector.insert_strided_slice %[[OUT_CHUNK1]], %[[ACCUM_A]] {offsets = [2], strides = [1]} : vector<2xf32> into vector<4xf32>
+// CHECK-NEXT:    return %[[FINAL_RESULT]] : vector<4xf32>
+func.func @conversion_splat(%in: vector<4xf8E5M2>, %scale: f8E8M0FNU) -> vector<4xf32> {
+    %splat = vector.splat %scale : vector<4xf8E8M0FNU>
+    %ext = arith.scaling_extf %in, %splat : vector<4xf8E5M2>, vector<4xf8E8M0FNU> to vector<4xf32>
+    return %ext : vector<4xf32>
+}
+
 // -----
 
 // CHECK-LABEL: @conversion_scalar
diff --git a/mlir/test/Conversion/ArithToAMDGPU/scaling-truncf.mlir b/mlir/test/Conversion/ArithToAMDGPU/scaling-truncf.mlir
index b60b1f372fd10..0519050c5ecc4 100644
--- a/mlir/test/Conversion/ArithToAMDGPU/scaling-truncf.mlir
+++ b/mlir/test/Conversion/ArithToAMDGPU/scaling-truncf.mlir
@@ -159,6 +159,28 @@ func.func @conversion_broadcast_odd(%in: vector<6xf32>, %scale: vector<2xf8E8M0F
 
 // -----
 
+// CHECK-LABEL: @conversion_splat
+// CHECK-DAG:     %[[CST:.+]] = arith.constant dense<0.000000e+00> : vector<4xf8E5M2>
+// CHECK-DAG:     %[[SCALE_SPLAT:.+]] = vector.splat %arg1 : vector<4xf8E8M0FNU>
+// CHECK-DAG:     %[[SCALE_EXTF:.+]] = arith.extf %[[SCALE_SPLAT]] : vector<4xf8E8M0FNU> to vector<4xf32>
+// CHECK-DAG:     %[[SCALE_SCALAR:.+]] = vector.extract %[[SCALE_EXTF]][0] : f32 from vector<4xf32>
+// CHECK:         %[[IN_CHUNK0:.+]] = vector.extract_strided_slice %arg0 {offsets = [0], sizes = [2], strides = [1]} : vector<4xf32> to vector<2xf32>
+// CHECK-NEXT:    %[[PACKED0:.+]] = amdgpu.packed_scaled_trunc %[[IN_CHUNK0]] into undef[0], %[[SCALE_SCALAR]] : vector<2xf32> to vector<4xf8E5M2>
+// CHECK-NEXT:    %[[OUT_CHUNK0:.+]] = vector.extract_strided_slice %[[PACKED0]] {offsets = [0], sizes = [2], strides = [1]} : vector<4xf8E5M2> to vector<2xf8E5M2>
+// CHECK-NEXT:    %[[ACCUM_A:.+]] = vector.insert_strided_slice %[[OUT_CHUNK0]], %[[CST]] {offsets = [0], strides = [1]} : vector<2xf8E5M2> into vector<4xf8E5M2>
+// CHECK-NEXT:    %[[IN_CHUNK1:.+]] = vector.extract_strided_slice %arg0 {offsets = [2], sizes = [2], strides = [1]} : vector<4xf32> to vector<2xf32>
+// CHECK-NEXT:    %[[PACKED1:.+]] = amdgpu.packed_scaled_trunc %[[IN_CHUNK1]] into undef[0], %[[SCALE_SCALAR]] : vector<2xf32> to vector<4xf8E5M2>
+// CHECK-NEXT:    %[[OUT_CHUNK1:.+]] = vector.extract_strided_slice %[[PACKED1]] {offsets = [0], sizes = [2], strides = [1]} : vector<4xf8E5M2> to vector<2xf8E5M2>
+// CHECK-NEXT:    %[[FINAL_RESULT:.+]] = vector.insert_strided_slice %[[OUT_CHUNK1]], %[[ACCUM_A]] {offsets = [2], strides = [1]} : vector<2xf8E5M2> into vector<4xf8E5M2>
+// CHECK-NEXT:    return %[[FINAL_RESULT]] : vector<4xf8E5M2>
+func.func @conversion_splat(%in: vector<4xf32>, %scale: f8E8M0FNU) -> vector<4xf8E5M2> {
+    %splat = vector.splat %scale : vector<4xf8E8M0FNU>
+    %ext = arith.scaling_truncf %in, %splat : vector<4xf32>, vector<4xf8E8M0FNU> to vector<4xf8E5M2>
+    return %ext : vector<4xf8E5M2>
+}
+
+// -----
+
 // CHECK-LABEL: @conversion_scalar
 // CHECK:         %[[SCALE_F32:.+]] = arith.extf %arg1 : f8E8M0FNU to f32
 // CHECK-NEXT:    %[[SPLAT_IN:.+]] = vector.splat %arg0 : vector<1xf32>

>From b50a495280e32beb676f836fff8a4595c72b50a1 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Tue, 8 Jul 2025 14:25:49 +0000
Subject: [PATCH 17/17] clang format

---
 mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
index 0601fd42630fe..cf9bb3a000050 100644
--- a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
+++ b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
@@ -641,7 +641,8 @@ ScalingTruncFRewritePattern::matchAndRewrite(arith::ScalingTruncFOp op,
       Value scaleTrunc = rewriter.create<amdgpu::PackedScaledTruncOp>(
           loc, truncScaleResultType, slice, uniformScale, 0,
           /*existing=*/nullptr);
-      int64_t packedWidth = cast<VectorType>(scaleTrunc.getType()).getNumElements();
+      int64_t packedWidth =
+          cast<VectorType>(scaleTrunc.getType()).getNumElements();
       if (packedWidth != opWidth)
         scaleTrunc = rewriter.create<vector::ExtractStridedSliceOp>(
             loc, scaleTrunc, 0, sliceWidth, 1);



More information about the Mlir-commits mailing list