[clang] [llvm] [AMDGPU] Add gfx1250 wmma_scale[16]_f32_32x16x128_f4 instructions (PR #152194)

Tue Aug 5 14:18:23 PDT 2025

https://github.com/rampitec updated https://github.com/llvm/llvm-project/pull/152194

>From 27300f7d2c75bcae3cf94a2c31ac958705d6a9ee Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Tue, 5 Aug 2025 12:42:36 -0700
Subject: [PATCH] [AMDGPU] Add gfx1250 wmma_scale[16]_f32_32x16x128_f4
 instructions

---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |   2 +
 clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp   |  10 +
 .../builtins-amdgcn-gfx1250-wmma-w32.cl       |  22 ++
 ...ins-amdgcn-error-gfx1250-wmma-w32-param.cl |  26 ++
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td      |  24 ++
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   2 +
 llvm/lib/Target/AMDGPU/VOP3PInstructions.td   |  19 ++
 .../AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll    | 166 ++++++++++
 .../llvm.amdgcn.wmma.imm.gfx1250.w32.ll       | 308 ++++++++++++++++++
 .../llvm.amdgcn.wmma.imod.gfx1250.w32.ll      | 158 +++++++++
 llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s    | 170 ++++++++++
 .../test/MC/AMDGPU/gfx1250_asm_wmma_w32_err.s |  30 ++
 .../AMDGPU/gfx1250_dasm_wmma_w32.txt          |  90 +++++
 13 files changed, 1027 insertions(+)

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 5821a69b4de17..b16d4a22207a7 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -805,6 +805,8 @@ TARGET_BUILTIN(__builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4, "V8fIiV16iIiV
 TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x32_f16, "V8fIbV16hIbV16hIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x32_f16, "V8hIbV16hIbV16hIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_32x16x128_f4, "V16fV16iV8iIsV16f", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_scale_f32_32x16x128_f4, "V16fV16iV8iIsV16fIiIiiIiIiiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4, "V16fV16iV8iIsV16fIiIiLiIiIiLiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x64_bf16, "V8fIbV16yIbV32yV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_swmmac_bf16_16x16x64_bf16, "V8yIbV16yIbV32yV8yiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16, "V8fIbV16yIbV32yV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index e450b1415a04b..dad1f95ac710d 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -894,6 +894,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_wmma_f32_32x16x128_f4:
   case AMDGPU::BI__builtin_amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
   case AMDGPU::BI__builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4:
+  case AMDGPU::BI__builtin_amdgcn_wmma_scale_f32_32x16x128_f4:
+  case AMDGPU::BI__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4:
   case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_f16:
   case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_bf16:
   case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x64_f16:
@@ -1172,6 +1174,14 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
       ArgsForMatchingMatrixTypes = {3, 0, 1};
       BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_32x16x128_f4;
       break;
+    case AMDGPU::BI__builtin_amdgcn_wmma_scale_f32_32x16x128_f4:
+      ArgsForMatchingMatrixTypes = {3, 0, 1};
+      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_scale_f32_32x16x128_f4;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4:
+      ArgsForMatchingMatrixTypes = {3, 0, 1};
+      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_scale16_f32_32x16x128_f4;
+      break;
     case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_f16:
       ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
       BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x64_f16;
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl
index af8a457bec686..bdb1a7f0bb32f 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl
@@ -314,6 +314,28 @@ void test_amdgcn_wmma_f32_wmma_f32_32x16x128_f4(global v16f* out, v16i a, v8i b,
   *out = __builtin_amdgcn_wmma_f32_32x16x128_f4(a, b, 0, c);
 }
 
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_scale_f32_32x16x128_f4(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], i16 0, <16 x float> [[C:%.*]], i32 1, i32 2, i32 [[SCALE_SRC0:%.*]], i32 2, i32 1, i32 [[SCALE_SRC1:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT:    store <16 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 64, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_wmma_scale_f32_32x16x128_f4(global v16f* out, v16i a, v8i b, v16f c, int scale_src0, int scale_src1)
+{
+  *out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, 0, c, 1, 2, scale_src0, 2, 1, scale_src1, 0, 1);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_scale16_f32_32x16x128_f4(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], i16 0, <16 x float> [[C:%.*]], i32 1, i32 2, i64 [[SCALE_SRC0:%.*]], i32 2, i32 1, i64 [[SCALE_SRC1:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT:    store <16 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 64, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_wmma_scale16_f32_32x16x128_f4(global v16f* out, v16i a, v8i b, v16f c, long scale_src0, long scale_src1)
+{
+  *out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, 0, c, 1, 2, scale_src0, 2, 1, scale_src1, 0, 1);
+}
+
 // CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f32_16x16x64_bf16(
 // CHECK-GFX1250-NEXT:  entry:
 // CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i32(i1 false, <16 x bfloat> [[A:%.*]], i1 false, <32 x bfloat> [[B:%.*]], <8 x float> [[C:%.*]], i32 [[INDEX:%.*]], i1 false, i1 true)
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl
index 83fdc8c8c60ba..49ef2e571740c 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl
@@ -230,6 +230,32 @@ void test_amdgcn_wmma_f32_wmma_f32_32x16x128_f4(global v16f* out, v16i a, v8i b,
   *out = __builtin_amdgcn_wmma_f32_32x16x128_f4(a, b, mod, c); // expected-error {{'__builtin_amdgcn_wmma_f32_32x16x128_f4' must be a constant integer}}
 }
 
+void test_amdgcn_wmma_scale_f32_32x16x128_f4(global v16f* out, v16i a, v8i b, v16f c, int mod, int scale_src0, int scale_src1, bool reuse)
+{
+  *out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, mod, c, 1, 0, scale_src0, 2, 0, scale_src1, 1, 0); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_32x16x128_f4' must be a constant integer}}
+  *out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, 0, c, mod, 0, scale_src0, 2, 0, scale_src1, 0, 0); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_32x16x128_f4' must be a constant integer}}
+  *out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, mod, 0, scale_src1, 0, 0); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_32x16x128_f4' must be a constant integer}}
+  *out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, reuse, 0); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_32x16x128_f4' must be a constant integer}}
+  *out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, 0, reuse); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_32x16x128_f4' must be a constant integer}}
+  *out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, 0, c, 1, mod, scale_src0, 2, 0, scale_src1, 1, 0); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_32x16x128_f4' must be a constant integer}}
+  *out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, mod, scale_src1, 1, 0); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_32x16x128_f4' must be a constant integer}}
+  *out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, mod, 0); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_32x16x128_f4' must be a constant integer}}
+  *out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, 1, mod); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_32x16x128_f4' must be a constant integer}}
+}
+
+void test_amdgcn_wmma_scale16_f32_32x16x128_f4(global v16f* out, v16i a, v8i b, v16f c, int mod, long scale_src0, long scale_src1, bool reuse)
+{
+  *out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, mod, c, 1, 0, scale_src0, 2, 0, scale_src1, 1, 0); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4' must be a constant integer}}
+  *out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, 0, c, mod, 0, scale_src0, 2, 0, scale_src1, 0, 0); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4' must be a constant integer}}
+  *out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, mod, 0, scale_src1, 0, 0); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4' must be a constant integer}}
+  *out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, reuse, 0); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4' must be a constant integer}}
+  *out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, 0, reuse); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4' must be a constant integer}}
+  *out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, 0, c, 1, mod, scale_src0, 2, 0, scale_src1, 1, 0); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4' must be a constant integer}}
+  *out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, mod, scale_src1, 1, 0); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4' must be a constant integer}}
+  *out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, mod, 0); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4' must be a constant integer}}
+  *out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, 1, mod); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4' must be a constant integer}}
+}
+
 void test_amdgcn_swmmac_f32_16x16x64_bf16(global v8f* out, v16bf16 a, v32bf16 b, v8f c, int index, int mod)
 {
   *out = __builtin_amdgcn_swmmac_f32_16x16x64_bf16(mod, a, 0, b, c, index, false, false); // expected-error {{'__builtin_amdgcn_swmmac_f32_16x16x64_bf16' must be a constant integer}}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index bfadc6a58f7f1..90cfd8cedd51b 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3956,6 +3956,28 @@ class AMDGPUWmmaScaleIntrinsicModsC<LLVMType scale_ty> :
      IntrWillReturn, IntrNoCallback, IntrNoFree]
 >;
 
+class AMDGPUWmmaScaleF4IntrinsicModsC<LLVMType scale_ty> :
+  Intrinsic<
+    [llvm_anyfloat_ty], // %D
+    [
+      llvm_anyint_ty,   // %A
+      llvm_anyint_ty,   // %B
+      llvm_i16_ty,      // %C_mod: 0 - none, 1 - neg, 2 - abs, 3 - neg(abs)
+      LLVMMatchType<0>, // %C
+      llvm_i32_ty,      // matrix_a_scale
+      llvm_i32_ty,      // matrix_a_scale_fmt
+      scale_ty,         // matrix a scale exponential
+      llvm_i32_ty,      // matrix_b_scale
+      llvm_i32_ty,      // matrix_b_scale_fmt
+      scale_ty,         // matrix b scale exponential
+      llvm_i1_ty,       // matrix_a_reuse
+      llvm_i1_ty,       // matrix_b_reuse
+    ],
+    [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<7>>,
+     ImmArg<ArgIndex<8>>, ImmArg<ArgIndex<10>>, ImmArg<ArgIndex<11>>,
+     IntrWillReturn, IntrNoCallback, IntrNoFree]
+>;
+
 defset list<Intrinsic> AMDGPUWMMAIntrinsicsGFX1250 = {
 def int_amdgcn_wmma_f32_16x16x4_f32       : AMDGPUWmmaIntrinsicModsAllReuse<llvm_anyfloat_ty, llvm_anyfloat_ty>;
 def int_amdgcn_wmma_f32_16x16x32_bf16     : AMDGPUWmmaIntrinsicModsAllReuse<llvm_anyfloat_ty, llvm_anyfloat_ty>;
@@ -3984,6 +4006,8 @@ def int_amdgcn_wmma_f32_16x16x128_f8f6f4  : AMDGPUWmmaIntrinsicModsC_MatrixFMT;
 def int_amdgcn_wmma_scale_f32_16x16x128_f8f6f4   : AMDGPUWmmaScaleIntrinsicModsC<llvm_i32_ty>;
 def int_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4 : AMDGPUWmmaScaleIntrinsicModsC<llvm_i64_ty>;
 def int_amdgcn_wmma_f32_32x16x128_f4       : AMDGPUWmmaIntrinsicF4ModsC<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty>;
+def int_amdgcn_wmma_scale_f32_32x16x128_f4 : AMDGPUWmmaScaleF4IntrinsicModsC<llvm_i32_ty>;
+def int_amdgcn_wmma_scale16_f32_32x16x128_f4 : AMDGPUWmmaScaleF4IntrinsicModsC<llvm_i64_ty>;
 }
 
 class AMDGPUSWmmacIntrinsicABIdx<LLVMType A, LLVMType B, LLVMType CD, LLVMType Index> :
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 74230a543ef16..868b1a21e3cd5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4801,6 +4801,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     case Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
     case Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4:
     case Intrinsic::amdgcn_wmma_f32_32x16x128_f4:
+    case Intrinsic::amdgcn_wmma_scale_f32_32x16x128_f4:
+    case Intrinsic::amdgcn_wmma_scale16_f32_32x16x128_f4:
     case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
     case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
     case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 9264935ffad7b..ce280d484da1b 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -1763,6 +1763,8 @@ def F16_FP8BF8X64_WMMA_w32       : VOP3PWMMA_Profile<[v8f16, v8i32, v8i32, v8f16
 def F16_FP8BF8X128_WMMA_w32      : VOP3PWMMA_Profile<[v8f16, v16i32, v16i32, v8f16], 0, 0, 0, 1, 1, 0, 0, 0, 1>;
 def F32_32X16X128_F4_WMMA_w32    : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 0, 1, 0, 0, 0, 0, 1>;
 def I32_IU8X64_WMMA_w32          : VOP3PWMMA_Profile<[v8i32, v8i32, v8i32, v8i32], 0, 0, 1, 0, 1, 0, 0, 0, 1>;
+def F32_32X16X128_F4_SCALE_w32   : VOP3PWMMA_Profile<[v16f32, v16i32,  v8i32,  v16f32], 0, 0, 0, 1, 1, 0, 1, 0, 1>;
+def F32_32X16X128_F4_SCALE16_w32 : VOP3PWMMA_Profile<[v16f32, v16i32,  v8i32,  v16f32], 0, 0, 0, 1, 1, 0, 1, 1, 1>;
 def F32_F16X64_SWMMAC_w32        : VOP3PWMMA_Profile<[v8f32, v16f16, v32f16, v8f32], 1, 16, 0, 0, 1, 0, 0, 0, 1>;
 def F32_BF16X64_SWMMAC_w32       : VOP3PWMMA_Profile<[v8f32, v16bf16, v32bf16, v8f32], 1, 16, 0, 0, 1, 0, 0, 0, 1>;
 def F16_F16X64_SWMMAC_w32        : VOP3PWMMA_Profile<[v8f16, v16f16, v32f16, v8f16], 1, 16, 0, 0, 1, 0, 0, 0, 1>;
@@ -1852,6 +1854,9 @@ defm V_SWMMAC_F16_16X16X64_F16_w32      : SWMMACInstGFX12<"v_swmmac_f16_16x16x64
 defm V_WMMA_F32_16X16X128_F8F6F4         : WMMAInst_SrcFormats_mc<"v_wmma_f32_16x16x128_f8f6f4", "F32_16X16X128_F8F6F4">;
 defm V_WMMA_SCALE_F32_16X16X128_F8F6F4   : WMMAInst_SrcFormats_mc<"v_wmma_scale_f32_16x16x128_f8f6f4", "F32_16X16X128_F8F6F4_SCALE">;
 defm V_WMMA_SCALE16_F32_16X16X128_F8F6F4 : WMMAInst_SrcFormats_mc<"v_wmma_scale16_f32_16x16x128_f8f6f4", "F32_16X16X128_F8F6F4_SCALE16">;
+
+defm V_WMMA_SCALE_F32_32X16X128_F4_w32   : WMMAInstGFX12<"v_wmma_scale_f32_32x16x128_f4",   F32_32X16X128_F4_SCALE_w32, "_w32">;
+defm V_WMMA_SCALE16_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale16_f32_32x16x128_f4", F32_32X16X128_F4_SCALE16_w32, "_w32">;
 } // End is_wmma_xdl = 1.
 
 defm V_WMMA_LD_SCALE_PAIRED_B32   : VOP3PInst<"v_wmma_ld_scale_paired_b32",   VOP_WMMA_LD_SCALE<i32, VCSrc_b32>>;
@@ -2010,6 +2015,8 @@ let SubtargetPredicate = isGFX125xOnly in {
   defm : WMMAPat<"V_WMMA_F32_16X16X128_BF8_FP8_w32",    int_amdgcn_wmma_f32_16x16x128_bf8_fp8,    F32_FP8BF8X128_WMMA_w32>;
   defm : WMMAPat<"V_WMMA_F32_16X16X128_BF8_BF8_w32",    int_amdgcn_wmma_f32_16x16x128_bf8_bf8,    F32_FP8BF8X128_WMMA_w32>;
   defm : WMMAPat<"V_WMMA_F32_32X16X128_F4_w32",         int_amdgcn_wmma_f32_32x16x128_f4,         F32_32X16X128_F4_WMMA_w32>;
+  defm : WMMAPat<"V_WMMA_SCALE_F32_32X16X128_F4_w32",   int_amdgcn_wmma_scale_f32_32x16x128_f4,   F32_32X16X128_F4_SCALE_w32>;
+  defm : WMMAPat<"V_WMMA_SCALE16_F32_32X16X128_F4_w32", int_amdgcn_wmma_scale16_f32_32x16x128_f4, F32_32X16X128_F4_SCALE16_w32>;
 
   foreach I = ["f8_f8", "f8_f6", "f8_f4", "f6_f8", "f6_f6", "f6_f4", "f4_f8", "f4_f6", "f4_f4"] in {
     defm : WMMAPat<"V_WMMA_F32_16X16X128_F8F6F4_" # I # "_w32",         int_amdgcn_wmma_f32_16x16x128_f8f6f4,         !cast<VOP3PWMMA_Profile>("F32_16X16X128_F8F6F4_" # I # "_w32")>;
@@ -2191,6 +2198,15 @@ class VOP3PX2e <bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile P> : Enc128, VO
   let Inst{127} = !if(P.NegLo2, src2_modifiers{0}, 0);
 }
 
+multiclass VOP3PX2_Real_ScaledWMMA_F4<bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile WMMAP> {
+   defvar PS = !cast<VOP3P_Pseudo>(NAME # "_twoaddr");
+   let SubtargetPredicate = isGFX1250Plus, WaveSizePredicate = isWave32,
+       DecoderNamespace = "GFX1250" in {
+    def _gfx1250 : VOP3P_Real_Gen<PS, GFX1250Gen, PS.Mnemonic>,
+                   VOP3PX2e <op, LdScaleOp, WMMAP>;
+  }
+}
+
 multiclass VOP3PX2_Real_ScaledWMMA<bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile WMMAP> {
   defvar PS = !cast<VOP3P_Pseudo>(NAME # "_twoaddr");
   defvar asmName = !substr(PS.Mnemonic, 0, !sub(!size(PS.Mnemonic), !size("_f8_f8_w32")));
@@ -2292,6 +2308,9 @@ defm V_WMMA_F32_16X16X128_F8F6F4        : VOP3P_Real_WMMA_gfx1250_SrcFormats<0x0
 defm V_WMMA_SCALE_F32_16X16X128_F8F6F4   : VOP3PX2_Real_ScaledWMMA_SrcFormats<0x033, 0x35, "F32_16X16X128_F8F6F4_SCALE">;
 defm V_WMMA_SCALE16_F32_16X16X128_F8F6F4 : VOP3PX2_Real_ScaledWMMA_SrcFormats<0x033, 0x3a, "F32_16X16X128_F8F6F4_SCALE16">;
 
+defm V_WMMA_SCALE_F32_32X16X128_F4_w32   : VOP3PX2_Real_ScaledWMMA_F4<0x088, 0x35, F32_32X16X128_F4_SCALE_w32>;
+defm V_WMMA_SCALE16_F32_32X16X128_F4_w32 : VOP3PX2_Real_ScaledWMMA_F4<0x088, 0x3a, F32_32X16X128_F4_SCALE16_w32>;
+
 defm V_SWMMAC_F32_16X16X64_F16_w32      : VOP3P_Real_WMMA_gfx1250 <0x065, F32_F16X64_SWMMAC_w32>;
 defm V_SWMMAC_F32_16X16X64_BF16_w32     : VOP3P_Real_WMMA_gfx1250 <0x066, F32_BF16X64_SWMMAC_w32>;
 defm V_SWMMAC_F16_16X16X64_F16_w32      : VOP3P_Real_WMMA_gfx1250 <0x067, F16_F16X64_SWMMAC_w32>;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll
index 1c7c625daaa78..1bf865c414279 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll
@@ -2236,6 +2236,170 @@ bb:
   ret void
 }
 
+define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4:
+; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], v40, v41 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
+; GFX1250-NEXT:    s_clause 0x3
+; GFX1250-NEXT:    global_store_b128 v[42:43], v[36:39], off offset:48
+; GFX1250-NEXT:    global_store_b128 v[42:43], v[32:35], off offset:32
+; GFX1250-NEXT:    global_store_b128 v[42:43], v[28:31], off offset:16
+; GFX1250-NEXT:    global_store_b128 v[42:43], v[24:27], off
+; GFX1250-NEXT:    s_endpgm
+;
+; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4:
+; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], v40, v41 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
+; GISEL-NEXT:    s_clause 0x3
+; GISEL-NEXT:    global_store_b128 v[42:43], v[24:27], off
+; GISEL-NEXT:    global_store_b128 v[42:43], v[28:31], off offset:16
+; GISEL-NEXT:    global_store_b128 v[42:43], v[32:35], off offset:32
+; GISEL-NEXT:    global_store_b128 v[42:43], v[36:39], off offset:48
+; GISEL-NEXT:    s_endpgm
+bb:
+  %res = call <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> %C, i32 1, i32 0, i32 %scale_src0, i32 1, i32 0, i32 %scale_src1, i1 false, i1 false)
+  store <16 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_ss(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i32 inreg %scale_src0, i32 inreg %scale_src1, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_ss:
+; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s0, s1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse
+; GFX1250-NEXT:    s_clause 0x3
+; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:48
+; GFX1250-NEXT:    global_store_b128 v[40:41], v[32:35], off offset:32
+; GFX1250-NEXT:    global_store_b128 v[40:41], v[28:31], off offset:16
+; GFX1250-NEXT:    global_store_b128 v[40:41], v[24:27], off
+; GFX1250-NEXT:    s_endpgm
+;
+; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4_ss:
+; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s0, s1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse
+; GISEL-NEXT:    s_clause 0x3
+; GISEL-NEXT:    global_store_b128 v[40:41], v[24:27], off
+; GISEL-NEXT:    global_store_b128 v[40:41], v[28:31], off offset:16
+; GISEL-NEXT:    global_store_b128 v[40:41], v[32:35], off offset:32
+; GISEL-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:48
+; GISEL-NEXT:    s_endpgm
+bb:
+  %res = call <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> %C, i32 2, i32 1, i32 %scale_src0, i32 1, i32 2, i32 %scale_src1, i1 true, i1 false)
+  store <16 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_si_scale(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i32 inreg %scale_src0, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_si_scale:
+; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_movk_i32 s1, 0x64
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT:    v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s0, s1 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
+; GFX1250-NEXT:    s_clause 0x3
+; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:48
+; GFX1250-NEXT:    global_store_b128 v[40:41], v[32:35], off offset:32
+; GFX1250-NEXT:    global_store_b128 v[40:41], v[28:31], off offset:16
+; GFX1250-NEXT:    global_store_b128 v[40:41], v[24:27], off
+; GFX1250-NEXT:    s_endpgm
+;
+; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4_si_scale:
+; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    v_mov_b32_e32 v42, 0x64
+; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT:    v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s0, v42 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
+; GISEL-NEXT:    s_clause 0x3
+; GISEL-NEXT:    global_store_b128 v[40:41], v[24:27], off
+; GISEL-NEXT:    global_store_b128 v[40:41], v[28:31], off offset:16
+; GISEL-NEXT:    global_store_b128 v[40:41], v[32:35], off offset:32
+; GISEL-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:48
+; GISEL-NEXT:    s_endpgm
+bb:
+  %res = call <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> %C, i32 3, i32 2, i32 %scale_src0, i32 0, i32 1, i32 100, i1 false, i1 true)
+  store <16 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4:
+; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], v[40:41], v[42:43] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
+; GFX1250-NEXT:    s_clause 0x3
+; GFX1250-NEXT:    global_store_b128 v[44:45], v[36:39], off offset:48
+; GFX1250-NEXT:    global_store_b128 v[44:45], v[32:35], off offset:32
+; GFX1250-NEXT:    global_store_b128 v[44:45], v[28:31], off offset:16
+; GFX1250-NEXT:    global_store_b128 v[44:45], v[24:27], off
+; GFX1250-NEXT:    s_endpgm
+;
+; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4:
+; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], v[40:41], v[42:43] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
+; GISEL-NEXT:    s_clause 0x3
+; GISEL-NEXT:    global_store_b128 v[44:45], v[24:27], off
+; GISEL-NEXT:    global_store_b128 v[44:45], v[28:31], off offset:16
+; GISEL-NEXT:    global_store_b128 v[44:45], v[32:35], off offset:32
+; GISEL-NEXT:    global_store_b128 v[44:45], v[36:39], off offset:48
+; GISEL-NEXT:    s_endpgm
+bb:
+  %res = call <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> %C, i32 1, i32 0, i64 %scale_src0, i32 1, i32 0, i64 %scale_src1, i1 false, i1 false)
+  store <16 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_ss(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i64 inreg %scale_src0, i64 inreg %scale_src1, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_ss:
+; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s[0:1], s[2:3] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse
+; GFX1250-NEXT:    s_clause 0x3
+; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:48
+; GFX1250-NEXT:    global_store_b128 v[40:41], v[32:35], off offset:32
+; GFX1250-NEXT:    global_store_b128 v[40:41], v[28:31], off offset:16
+; GFX1250-NEXT:    global_store_b128 v[40:41], v[24:27], off
+; GFX1250-NEXT:    s_endpgm
+;
+; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4_ss:
+; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s[0:1], s[2:3] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse
+; GISEL-NEXT:    s_clause 0x3
+; GISEL-NEXT:    global_store_b128 v[40:41], v[24:27], off
+; GISEL-NEXT:    global_store_b128 v[40:41], v[28:31], off offset:16
+; GISEL-NEXT:    global_store_b128 v[40:41], v[32:35], off offset:32
+; GISEL-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:48
+; GISEL-NEXT:    s_endpgm
+bb:
+  %res = call <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> %C, i32 2, i32 1, i64 %scale_src0, i32 1, i32 2, i64 %scale_src1, i1 true, i1 false)
+  store <16 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_si_scale(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i64 inreg %scale_src0, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_si_scale:
+; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_mov_b64 s[2:3], 0x64
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT:    v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s[0:1], s[2:3] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
+; GFX1250-NEXT:    s_clause 0x3
+; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:48
+; GFX1250-NEXT:    global_store_b128 v[40:41], v[32:35], off offset:32
+; GFX1250-NEXT:    global_store_b128 v[40:41], v[28:31], off offset:16
+; GFX1250-NEXT:    global_store_b128 v[40:41], v[24:27], off
+; GFX1250-NEXT:    s_endpgm
+;
+; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4_si_scale:
+; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    v_mov_b64_e32 v[42:43], 0x64
+; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT:    v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s[0:1], v[42:43] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
+; GISEL-NEXT:    s_clause 0x3
+; GISEL-NEXT:    global_store_b128 v[40:41], v[24:27], off
+; GISEL-NEXT:    global_store_b128 v[40:41], v[28:31], off offset:16
+; GISEL-NEXT:    global_store_b128 v[40:41], v[32:35], off offset:32
+; GISEL-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:48
+; GISEL-NEXT:    s_endpgm
+bb:
+  %res = call <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> %C, i32 3, i32 2, i64 %scale_src0, i32 0, i32 1, i64 100, i1 false, i1 true)
+  store <16 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
 define amdgpu_ps void @test_swmmac_f32_16x16x64_bf16(<16 x bfloat> %A, <32 x bfloat> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_f32_16x16x64_bf16:
 ; GFX1250:       ; %bb.0: ; %bb
@@ -2573,6 +2737,8 @@ declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i
 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1)
 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1)
 declare <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16i32.v8i32.v16f32(<16 x i32>, <8 x i32>, i16, <16 x float>)
+declare <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32>, <8 x i32>, i16, <16 x float>, i32, i32, i32, i32, i32, i32, i1, i1)
+declare <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32>, <8 x i32>, i16, <16 x float>, i32, i32, i64, i32, i32, i64, i1, i1)
 
 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x float>, i16, i1, i1)
 declare <8 x bfloat> @llvm.amdgcn.swmmac.bf16.16x16x64.bf16.v8bf16.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x bfloat>, i16, i1, i1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll
index e602c31ebd809..48303c004f1d0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll
@@ -2530,6 +2530,312 @@ bb:
   ret void
 }
 
+define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4(<16 x i32> %A, <8 x i32> %B, i32 inreg %scale_src0, i32 inreg %scale_src1, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4:
+; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], 1.0, s0, s1 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
+; GFX1250-NEXT:    s_clause 0x3
+; GFX1250-NEXT:    global_store_b128 v[24:25], v[38:41], off offset:48
+; GFX1250-NEXT:    global_store_b128 v[24:25], v[34:37], off offset:32
+; GFX1250-NEXT:    global_store_b128 v[24:25], v[30:33], off offset:16
+; GFX1250-NEXT:    global_store_b128 v[24:25], v[26:29], off
+; GFX1250-NEXT:    s_endpgm
+;
+; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4:
+; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], 1.0, s0, s1 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
+; GISEL-NEXT:    s_clause 0x3
+; GISEL-NEXT:    global_store_b128 v[24:25], v[26:29], off
+; GISEL-NEXT:    global_store_b128 v[24:25], v[30:33], off offset:16
+; GISEL-NEXT:    global_store_b128 v[24:25], v[34:37], off offset:32
+; GISEL-NEXT:    global_store_b128 v[24:25], v[38:41], off offset:48
+; GISEL-NEXT:    s_endpgm
+bb:
+  %res = call <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 1, i32 0, i32 %scale_src0, i32 1, i32 0, i32 %scale_src1, i1 true, i1 false)
+  store <16 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_non_splat(<16 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_non_splat:
+; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    v_dual_mov_b32 v26, 1.0 :: v_dual_mov_b32 v27, 2.0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_dual_mov_b32 v28, v26 :: v_dual_mov_b32 v29, v26
+; GFX1250-NEXT:    v_dual_mov_b32 v30, v26 :: v_dual_mov_b32 v31, v26
+; GFX1250-NEXT:    v_dual_mov_b32 v32, v26 :: v_dual_mov_b32 v33, v26
+; GFX1250-NEXT:    v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v26
+; GFX1250-NEXT:    v_dual_mov_b32 v36, v26 :: v_dual_mov_b32 v37, v26
+; GFX1250-NEXT:    v_dual_mov_b32 v38, v26 :: v_dual_mov_b32 v39, v26
+; GFX1250-NEXT:    v_dual_mov_b32 v40, v26 :: v_dual_mov_b32 v41, v26
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], 1, 2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
+; GFX1250-NEXT:    s_clause 0x3
+; GFX1250-NEXT:    global_store_b128 v[24:25], v[38:41], off offset:48
+; GFX1250-NEXT:    global_store_b128 v[24:25], v[34:37], off offset:32
+; GFX1250-NEXT:    global_store_b128 v[24:25], v[30:33], off offset:16
+; GFX1250-NEXT:    global_store_b128 v[24:25], v[26:29], off
+; GFX1250-NEXT:    s_endpgm
+;
+; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4_non_splat:
+; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_mov_b32 s0, 1.0
+; GISEL-NEXT:    s_mov_b32 s1, 2.0
+; GISEL-NEXT:    s_mov_b32 s14, s0
+; GISEL-NEXT:    s_mov_b32 s15, s0
+; GISEL-NEXT:    s_mov_b32 s2, s0
+; GISEL-NEXT:    s_mov_b32 s3, s0
+; GISEL-NEXT:    s_mov_b32 s4, s0
+; GISEL-NEXT:    s_mov_b32 s5, s0
+; GISEL-NEXT:    s_mov_b32 s6, s0
+; GISEL-NEXT:    s_mov_b32 s7, s0
+; GISEL-NEXT:    s_mov_b32 s8, s0
+; GISEL-NEXT:    s_mov_b32 s9, s0
+; GISEL-NEXT:    s_mov_b32 s10, s0
+; GISEL-NEXT:    s_mov_b32 s11, s0
+; GISEL-NEXT:    s_mov_b32 s12, s0
+; GISEL-NEXT:    s_mov_b32 s13, s0
+; GISEL-NEXT:    v_mov_b64_e32 v[40:41], s[14:15]
+; GISEL-NEXT:    v_mov_b64_e32 v[38:39], s[12:13]
+; GISEL-NEXT:    v_mov_b64_e32 v[36:37], s[10:11]
+; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[8:9]
+; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[6:7]
+; GISEL-NEXT:    v_mov_b64_e32 v[30:31], s[4:5]
+; GISEL-NEXT:    v_mov_b64_e32 v[28:29], s[2:3]
+; GISEL-NEXT:    v_mov_b64_e32 v[26:27], s[0:1]
+; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT:    v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], 1, 2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
+; GISEL-NEXT:    s_clause 0x3
+; GISEL-NEXT:    global_store_b128 v[24:25], v[26:29], off
+; GISEL-NEXT:    global_store_b128 v[24:25], v[30:33], off offset:16
+; GISEL-NEXT:    global_store_b128 v[24:25], v[34:37], off offset:32
+; GISEL-NEXT:    global_store_b128 v[24:25], v[38:41], off offset:48
+; GISEL-NEXT:    s_endpgm
+bb:
+  %res = call <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> <float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 1, i32 0, i32 1, i32 1, i32 0, i32 2, i1 false, i1 false)
+  store <16 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_non_inlineable(<16 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_non_inlineable:
+; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    v_mov_b32_e32 v26, 0x40400000
+; GFX1250-NEXT:    s_movk_i32 s0, 0x65
+; GFX1250-NEXT:    s_movk_i32 s1, 0x64
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_dual_mov_b32 v27, v26 :: v_dual_mov_b32 v28, v26
+; GFX1250-NEXT:    v_dual_mov_b32 v29, v26 :: v_dual_mov_b32 v30, v26
+; GFX1250-NEXT:    v_dual_mov_b32 v31, v26 :: v_dual_mov_b32 v32, v26
+; GFX1250-NEXT:    v_dual_mov_b32 v33, v26 :: v_dual_mov_b32 v34, v26
+; GFX1250-NEXT:    v_dual_mov_b32 v35, v26 :: v_dual_mov_b32 v36, v26
+; GFX1250-NEXT:    v_dual_mov_b32 v37, v26 :: v_dual_mov_b32 v38, v26
+; GFX1250-NEXT:    v_dual_mov_b32 v39, v26 :: v_dual_mov_b32 v40, v26
+; GFX1250-NEXT:    v_mov_b32_e32 v41, v26
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], s1, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
+; GFX1250-NEXT:    s_clause 0x3
+; GFX1250-NEXT:    global_store_b128 v[24:25], v[38:41], off offset:48
+; GFX1250-NEXT:    global_store_b128 v[24:25], v[34:37], off offset:32
+; GFX1250-NEXT:    global_store_b128 v[24:25], v[30:33], off offset:16
+; GFX1250-NEXT:    global_store_b128 v[24:25], v[26:29], off
+; GFX1250-NEXT:    s_endpgm
+;
+; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4_non_inlineable:
+; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_mov_b32 s0, 0x40400000
+; GISEL-NEXT:    v_mov_b32_e32 v42, 0x64
+; GISEL-NEXT:    s_mov_b32 s14, s0
+; GISEL-NEXT:    s_mov_b32 s15, s0
+; GISEL-NEXT:    s_mov_b32 s1, s0
+; GISEL-NEXT:    s_mov_b32 s2, s0
+; GISEL-NEXT:    s_mov_b32 s3, s0
+; GISEL-NEXT:    s_mov_b32 s4, s0
+; GISEL-NEXT:    s_mov_b32 s5, s0
+; GISEL-NEXT:    s_mov_b32 s6, s0
+; GISEL-NEXT:    s_mov_b32 s7, s0
+; GISEL-NEXT:    s_mov_b32 s8, s0
+; GISEL-NEXT:    s_mov_b32 s9, s0
+; GISEL-NEXT:    s_mov_b32 s10, s0
+; GISEL-NEXT:    s_mov_b32 s11, s0
+; GISEL-NEXT:    s_mov_b32 s12, s0
+; GISEL-NEXT:    s_mov_b32 s13, s0
+; GISEL-NEXT:    v_mov_b64_e32 v[40:41], s[14:15]
+; GISEL-NEXT:    v_mov_b64_e32 v[38:39], s[12:13]
+; GISEL-NEXT:    v_mov_b64_e32 v[36:37], s[10:11]
+; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[8:9]
+; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[6:7]
+; GISEL-NEXT:    v_mov_b64_e32 v[30:31], s[4:5]
+; GISEL-NEXT:    v_mov_b64_e32 v[28:29], s[2:3]
+; GISEL-NEXT:    v_mov_b64_e32 v[26:27], s[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v43, 0x65
+; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT:    v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], v42, v43 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
+; GISEL-NEXT:    s_clause 0x3
+; GISEL-NEXT:    global_store_b128 v[24:25], v[26:29], off
+; GISEL-NEXT:    global_store_b128 v[24:25], v[30:33], off offset:16
+; GISEL-NEXT:    global_store_b128 v[24:25], v[34:37], off offset:32
+; GISEL-NEXT:    global_store_b128 v[24:25], v[38:41], off offset:48
+; GISEL-NEXT:    s_endpgm
+bb:
+  %res = call <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, i32 1, i32 0, i32 100, i32 1, i32 0, i32 101, i1 true, i1 false)
+  store <16 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4(<16 x i32> %A, <8 x i32> %B, i64 inreg %scale_src0, i64 inreg %scale_src1, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4:
+; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], 1.0, s[0:1], s[2:3] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
+; GFX1250-NEXT:    s_clause 0x3
+; GFX1250-NEXT:    global_store_b128 v[24:25], v[38:41], off offset:48
+; GFX1250-NEXT:    global_store_b128 v[24:25], v[34:37], off offset:32
+; GFX1250-NEXT:    global_store_b128 v[24:25], v[30:33], off offset:16
+; GFX1250-NEXT:    global_store_b128 v[24:25], v[26:29], off
+; GFX1250-NEXT:    s_endpgm
+;
+; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4:
+; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], 1.0, s[0:1], s[2:3] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
+; GISEL-NEXT:    s_clause 0x3
+; GISEL-NEXT:    global_store_b128 v[24:25], v[26:29], off
+; GISEL-NEXT:    global_store_b128 v[24:25], v[30:33], off offset:16
+; GISEL-NEXT:    global_store_b128 v[24:25], v[34:37], off offset:32
+; GISEL-NEXT:    global_store_b128 v[24:25], v[38:41], off offset:48
+; GISEL-NEXT:    s_endpgm
+bb:
+  %res = call <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 1, i32 0, i64 %scale_src0, i32 1, i32 0, i64 %scale_src1, i1 true, i1 false)
+  store <16 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_non_splat(<16 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_non_splat:
+; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    v_dual_mov_b32 v26, 1.0 :: v_dual_mov_b32 v27, 2.0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_dual_mov_b32 v28, v26 :: v_dual_mov_b32 v29, v26
+; GFX1250-NEXT:    v_dual_mov_b32 v30, v26 :: v_dual_mov_b32 v31, v26
+; GFX1250-NEXT:    v_dual_mov_b32 v32, v26 :: v_dual_mov_b32 v33, v26
+; GFX1250-NEXT:    v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v26
+; GFX1250-NEXT:    v_dual_mov_b32 v36, v26 :: v_dual_mov_b32 v37, v26
+; GFX1250-NEXT:    v_dual_mov_b32 v38, v26 :: v_dual_mov_b32 v39, v26
+; GFX1250-NEXT:    v_dual_mov_b32 v40, v26 :: v_dual_mov_b32 v41, v26
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], 1, 2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
+; GFX1250-NEXT:    s_clause 0x3
+; GFX1250-NEXT:    global_store_b128 v[24:25], v[38:41], off offset:48
+; GFX1250-NEXT:    global_store_b128 v[24:25], v[34:37], off offset:32
+; GFX1250-NEXT:    global_store_b128 v[24:25], v[30:33], off offset:16
+; GFX1250-NEXT:    global_store_b128 v[24:25], v[26:29], off
+; GFX1250-NEXT:    s_endpgm
+;
+; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4_non_splat:
+; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_mov_b32 s0, 1.0
+; GISEL-NEXT:    s_mov_b32 s1, 2.0
+; GISEL-NEXT:    s_mov_b32 s14, s0
+; GISEL-NEXT:    s_mov_b32 s15, s0
+; GISEL-NEXT:    s_mov_b32 s2, s0
+; GISEL-NEXT:    s_mov_b32 s3, s0
+; GISEL-NEXT:    s_mov_b32 s4, s0
+; GISEL-NEXT:    s_mov_b32 s5, s0
+; GISEL-NEXT:    s_mov_b32 s6, s0
+; GISEL-NEXT:    s_mov_b32 s7, s0
+; GISEL-NEXT:    s_mov_b32 s8, s0
+; GISEL-NEXT:    s_mov_b32 s9, s0
+; GISEL-NEXT:    s_mov_b32 s10, s0
+; GISEL-NEXT:    s_mov_b32 s11, s0
+; GISEL-NEXT:    s_mov_b32 s12, s0
+; GISEL-NEXT:    s_mov_b32 s13, s0
+; GISEL-NEXT:    v_mov_b64_e32 v[40:41], s[14:15]
+; GISEL-NEXT:    v_mov_b64_e32 v[38:39], s[12:13]
+; GISEL-NEXT:    v_mov_b64_e32 v[36:37], s[10:11]
+; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[8:9]
+; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[6:7]
+; GISEL-NEXT:    v_mov_b64_e32 v[30:31], s[4:5]
+; GISEL-NEXT:    v_mov_b64_e32 v[28:29], s[2:3]
+; GISEL-NEXT:    v_mov_b64_e32 v[26:27], s[0:1]
+; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT:    v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], 1, 2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
+; GISEL-NEXT:    s_clause 0x3
+; GISEL-NEXT:    global_store_b128 v[24:25], v[26:29], off
+; GISEL-NEXT:    global_store_b128 v[24:25], v[30:33], off offset:16
+; GISEL-NEXT:    global_store_b128 v[24:25], v[34:37], off offset:32
+; GISEL-NEXT:    global_store_b128 v[24:25], v[38:41], off offset:48
+; GISEL-NEXT:    s_endpgm
+bb:
+  %res = call <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> <float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 1, i32 0, i64 1, i32 1, i32 0, i64 2, i1 false, i1 false)
+  store <16 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_non_inlineable(<16 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_non_inlineable:
+; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    v_mov_b32_e32 v26, 0x40400000
+; GFX1250-NEXT:    s_mov_b64 s[0:1], 0x65
+; GFX1250-NEXT:    s_mov_b64 s[2:3], 0x64
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_dual_mov_b32 v27, v26 :: v_dual_mov_b32 v28, v26
+; GFX1250-NEXT:    v_dual_mov_b32 v29, v26 :: v_dual_mov_b32 v30, v26
+; GFX1250-NEXT:    v_dual_mov_b32 v31, v26 :: v_dual_mov_b32 v32, v26
+; GFX1250-NEXT:    v_dual_mov_b32 v33, v26 :: v_dual_mov_b32 v34, v26
+; GFX1250-NEXT:    v_dual_mov_b32 v35, v26 :: v_dual_mov_b32 v36, v26
+; GFX1250-NEXT:    v_dual_mov_b32 v37, v26 :: v_dual_mov_b32 v38, v26
+; GFX1250-NEXT:    v_dual_mov_b32 v39, v26 :: v_dual_mov_b32 v40, v26
+; GFX1250-NEXT:    v_mov_b32_e32 v41, v26
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], s[2:3], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
+; GFX1250-NEXT:    s_clause 0x3
+; GFX1250-NEXT:    global_store_b128 v[24:25], v[38:41], off offset:48
+; GFX1250-NEXT:    global_store_b128 v[24:25], v[34:37], off offset:32
+; GFX1250-NEXT:    global_store_b128 v[24:25], v[30:33], off offset:16
+; GFX1250-NEXT:    global_store_b128 v[24:25], v[26:29], off
+; GFX1250-NEXT:    s_endpgm
+;
+; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4_non_inlineable:
+; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_mov_b32 s0, 0x40400000
+; GISEL-NEXT:    v_mov_b64_e32 v[42:43], 0x64
+; GISEL-NEXT:    s_mov_b32 s14, s0
+; GISEL-NEXT:    s_mov_b32 s15, s0
+; GISEL-NEXT:    s_mov_b32 s1, s0
+; GISEL-NEXT:    s_mov_b32 s2, s0
+; GISEL-NEXT:    s_mov_b32 s3, s0
+; GISEL-NEXT:    s_mov_b32 s4, s0
+; GISEL-NEXT:    s_mov_b32 s5, s0
+; GISEL-NEXT:    s_mov_b32 s6, s0
+; GISEL-NEXT:    s_mov_b32 s7, s0
+; GISEL-NEXT:    s_mov_b32 s8, s0
+; GISEL-NEXT:    s_mov_b32 s9, s0
+; GISEL-NEXT:    s_mov_b32 s10, s0
+; GISEL-NEXT:    s_mov_b32 s11, s0
+; GISEL-NEXT:    s_mov_b32 s12, s0
+; GISEL-NEXT:    s_mov_b32 s13, s0
+; GISEL-NEXT:    v_mov_b64_e32 v[40:41], s[14:15]
+; GISEL-NEXT:    v_mov_b64_e32 v[38:39], s[12:13]
+; GISEL-NEXT:    v_mov_b64_e32 v[36:37], s[10:11]
+; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[8:9]
+; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[6:7]
+; GISEL-NEXT:    v_mov_b64_e32 v[30:31], s[4:5]
+; GISEL-NEXT:    v_mov_b64_e32 v[28:29], s[2:3]
+; GISEL-NEXT:    v_mov_b64_e32 v[26:27], s[0:1]
+; GISEL-NEXT:    v_mov_b64_e32 v[44:45], 0x65
+; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT:    v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], v[42:43], v[44:45] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
+; GISEL-NEXT:    s_clause 0x3
+; GISEL-NEXT:    global_store_b128 v[24:25], v[26:29], off
+; GISEL-NEXT:    global_store_b128 v[24:25], v[30:33], off offset:16
+; GISEL-NEXT:    global_store_b128 v[24:25], v[34:37], off offset:32
+; GISEL-NEXT:    global_store_b128 v[24:25], v[38:41], off offset:48
+; GISEL-NEXT:    s_endpgm
+bb:
+  %res = call <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, i32 1, i32 0, i64 100, i32 1, i32 0, i64 101, i1 true, i1 false)
+  store <16 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1, <2 x float>, i1, <2 x float>, i16, <8 x float>, i1, i1)
 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x float>, i1, i1)
 declare <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x bfloat>, i1, i1)
@@ -2557,3 +2863,5 @@ declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i
 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1)
 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1)
 declare <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16i32.v8i32.v16f32(<16 x i32>, <8 x i32>, i16, <16 x float>)
+declare <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32>, <8 x i32>, i16, <16 x float>, i32, i32, i32, i32, i32, i32, i1, i1)
+declare <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32>, <8 x i32>, i16, <16 x float>, i32, i32, i64, i32, i32, i64, i1, i1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll
index 14699ce630c19..8f674f84206ff 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll
@@ -1882,6 +1882,162 @@ bb:
   ret void
 }
 
+define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_negC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_negC:
+; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1]
+; GFX1250-NEXT:    s_clause 0x3
+; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:48
+; GFX1250-NEXT:    global_store_b128 v[40:41], v[32:35], off offset:32
+; GFX1250-NEXT:    global_store_b128 v[40:41], v[28:31], off offset:16
+; GFX1250-NEXT:    global_store_b128 v[40:41], v[24:27], off
+; GFX1250-NEXT:    s_endpgm
+;
+; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4_negC:
+; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1]
+; GISEL-NEXT:    s_clause 0x3
+; GISEL-NEXT:    global_store_b128 v[40:41], v[24:27], off
+; GISEL-NEXT:    global_store_b128 v[40:41], v[28:31], off offset:16
+; GISEL-NEXT:    global_store_b128 v[40:41], v[32:35], off offset:32
+; GISEL-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:48
+; GISEL-NEXT:    s_endpgm
+bb:
+  %res = call <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 1, <16 x float> %C, i32 1, i32 0, i32 2, i32 1, i32 0, i32 4, i1 false, i1 false)
+  store <16 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_neg_absC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_neg_absC:
+; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX1250-NEXT:    s_clause 0x3
+; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:48
+; GFX1250-NEXT:    global_store_b128 v[40:41], v[32:35], off offset:32
+; GFX1250-NEXT:    global_store_b128 v[40:41], v[28:31], off offset:16
+; GFX1250-NEXT:    global_store_b128 v[40:41], v[24:27], off
+; GFX1250-NEXT:    s_endpgm
+;
+; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4_neg_absC:
+; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GISEL-NEXT:    s_clause 0x3
+; GISEL-NEXT:    global_store_b128 v[40:41], v[24:27], off
+; GISEL-NEXT:    global_store_b128 v[40:41], v[28:31], off offset:16
+; GISEL-NEXT:    global_store_b128 v[40:41], v[32:35], off offset:32
+; GISEL-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:48
+; GISEL-NEXT:    s_endpgm
+bb:
+  %res = call <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 3, <16 x float> %C, i32 1, i32 0, i32 2, i32 1, i32 0, i32 4, i1 false, i1 false)
+  store <16 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_ignoreC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_ignoreC:
+; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
+; GFX1250-NEXT:    s_clause 0x3
+; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:48
+; GFX1250-NEXT:    global_store_b128 v[40:41], v[32:35], off offset:32
+; GFX1250-NEXT:    global_store_b128 v[40:41], v[28:31], off offset:16
+; GFX1250-NEXT:    global_store_b128 v[40:41], v[24:27], off
+; GFX1250-NEXT:    s_endpgm
+;
+; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4_ignoreC:
+; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
+; GISEL-NEXT:    s_clause 0x3
+; GISEL-NEXT:    global_store_b128 v[40:41], v[24:27], off
+; GISEL-NEXT:    global_store_b128 v[40:41], v[28:31], off offset:16
+; GISEL-NEXT:    global_store_b128 v[40:41], v[32:35], off offset:32
+; GISEL-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:48
+; GISEL-NEXT:    s_endpgm
+bb:
+  %res = call <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 4, <16 x float> %C, i32 1, i32 0, i32 2, i32 1, i32 0, i32 4, i1 false, i1 false)
+  store <16 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_negC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_negC:
+; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1]
+; GFX1250-NEXT:    s_clause 0x3
+; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:48
+; GFX1250-NEXT:    global_store_b128 v[40:41], v[32:35], off offset:32
+; GFX1250-NEXT:    global_store_b128 v[40:41], v[28:31], off offset:16
+; GFX1250-NEXT:    global_store_b128 v[40:41], v[24:27], off
+; GFX1250-NEXT:    s_endpgm
+;
+; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4_negC:
+; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1]
+; GISEL-NEXT:    s_clause 0x3
+; GISEL-NEXT:    global_store_b128 v[40:41], v[24:27], off
+; GISEL-NEXT:    global_store_b128 v[40:41], v[28:31], off offset:16
+; GISEL-NEXT:    global_store_b128 v[40:41], v[32:35], off offset:32
+; GISEL-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:48
+; GISEL-NEXT:    s_endpgm
+bb:
+  %res = call <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 1, <16 x float> %C, i32 1, i32 0, i64 2, i32 1, i32 0, i64 4, i1 false, i1 false)
+  store <16 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_neg_absC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_neg_absC:
+; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX1250-NEXT:    s_clause 0x3
+; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:48
+; GFX1250-NEXT:    global_store_b128 v[40:41], v[32:35], off offset:32
+; GFX1250-NEXT:    global_store_b128 v[40:41], v[28:31], off offset:16
+; GFX1250-NEXT:    global_store_b128 v[40:41], v[24:27], off
+; GFX1250-NEXT:    s_endpgm
+;
+; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4_neg_absC:
+; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GISEL-NEXT:    s_clause 0x3
+; GISEL-NEXT:    global_store_b128 v[40:41], v[24:27], off
+; GISEL-NEXT:    global_store_b128 v[40:41], v[28:31], off offset:16
+; GISEL-NEXT:    global_store_b128 v[40:41], v[32:35], off offset:32
+; GISEL-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:48
+; GISEL-NEXT:    s_endpgm
+bb:
+  %res = call <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 3, <16 x float> %C, i32 1, i32 0, i64 2, i32 1, i32 0, i64 4, i1 false, i1 false)
+  store <16 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_ignoreC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_ignoreC:
+; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
+; GFX1250-NEXT:    s_clause 0x3
+; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:48
+; GFX1250-NEXT:    global_store_b128 v[40:41], v[32:35], off offset:32
+; GFX1250-NEXT:    global_store_b128 v[40:41], v[28:31], off offset:16
+; GFX1250-NEXT:    global_store_b128 v[40:41], v[24:27], off
+; GFX1250-NEXT:    s_endpgm
+;
+; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4_ignoreC:
+; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
+; GISEL-NEXT:    s_clause 0x3
+; GISEL-NEXT:    global_store_b128 v[40:41], v[24:27], off
+; GISEL-NEXT:    global_store_b128 v[40:41], v[28:31], off offset:16
+; GISEL-NEXT:    global_store_b128 v[40:41], v[32:35], off offset:32
+; GISEL-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:48
+; GISEL-NEXT:    s_endpgm
+bb:
+  %res = call <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 4, <16 x float> %C, i32 1, i32 0, i64 2, i32 1, i32 0, i64 4, i1 false, i1 false)
+  store <16 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
 define amdgpu_ps void @test_swmmac_f32_16x16x64_bf16_negA(<16 x bfloat> %A, <32 x bfloat> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_f32_16x16x64_bf16_negA:
 ; GFX1250:       ; %bb.0: ; %bb
@@ -2177,6 +2333,8 @@ declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i
 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1)
 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1)
 declare <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16i32.v8i32.v16f32(<16 x i32>, <8 x i32>, i16, <16 x float>)
+declare <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32>, <8 x i32>, i16, <16 x float>, i32, i32, i32, i32, i32, i32, i1, i1)
+declare <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32>, <8 x i32>, i16, <16 x float>, i32, i32, i64, i32, i32, i64, i1, i1)
 
 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x float>, i16, i1, i1)
 declare <8 x bfloat> @llvm.amdgcn.swmmac.bf16.16x16x64.bf16.v8bf16.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x bfloat>, i16, i1, i1)
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s b/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s
index 93e65d3444b89..8185b77beb935 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s
@@ -1737,3 +1737,173 @@ v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[0,0,1] neg_hi:
 // GFX1250: v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x88,0xcc,0x00,0x05,0x12,0x9c]
 // WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
 // GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1]
+// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s1, s2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1]
+// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s1, s2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x04,0x00,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0
+// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW0
+// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1
+// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_reuse
+// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_reuse ; encoding: [0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
+// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW0
+// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1
+// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_reuse
+// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse
+// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8
+// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3
+// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x20,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3
+// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x40,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3
+// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3
+// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1]
+// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1]
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s[2:3], s[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1]
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s[2:3], s[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x08,0x00,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1]
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW0
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_reuse
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_reuse ; encoding: [0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW0
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_reuse
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x20,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x40,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1]
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32_err.s b/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32_err.s
index 1eae8f6ba451c..41cac9d1470ae 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32_err.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32_err.s
@@ -449,6 +449,16 @@ v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:35], v[40:47] matrix_b_fmt:MAT
 // GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:35], v[40:47] matrix_b_fmt:MATRIX_FMT_FP4
 // GFX1250-ERR-NEXT: {{^}}                                             ^
 
+v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:31], v[40:47], v1, v2
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_FP8
+// GFX1250-ERR-NEXT: {{^}}v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:31], v[40:47], v1, v2
+// GFX1250-ERR-NEXT: {{^}}                                                   ^
+
+v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_FP6
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_FP6
+// GFX1250-ERR-NEXT: {{^}}v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_FP6
+// GFX1250-ERR-NEXT: {{^}}                                            ^
+
 v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[1,0,0]
 // GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
 // GFX1250-ERR-NEXT: {{^}}v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[1,0,0]
@@ -468,3 +478,23 @@ v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_hi:[0,1,0]
 // GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
 // GFX1250-ERR-NEXT: {{^}}v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_hi:[0,1,0]
 // GFX1250-ERR-NEXT: {{^}}                                                          ^
+
+v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 neg_lo:[1,0,0]
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+// GFX1250-ERR-NEXT: {{^}}v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 neg_lo:[1,0,0]
+// GFX1250-ERR-NEXT: {{^}}                                                                           ^
+
+v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_fmt:0
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+// GFX1250-ERR-NEXT: {{^}}v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_fmt:0
+// GFX1250-ERR-NEXT: {{^}}                                                                           ^
+
+v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[0:1], v[2:3] neg_lo:[1,0,0]
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+// GFX1250-ERR-NEXT: {{^}}v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[0:1], v[2:3] neg_lo:[1,0,0]
+// GFX1250-ERR-NEXT: {{^}}                                                                                     ^
+
+v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[0:1], v[2:3] matrix_a_fmt:0
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+// GFX1250-ERR-NEXT: {{^}}v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[0:1], v[2:3] matrix_a_fmt:0
+// GFX1250-ERR-NEXT: {{^}}                                                                                     ^
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_wmma_w32.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_wmma_w32.txt
index 2216348fa43c3..a409dac321f83 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_wmma_w32.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_wmma_w32.txt
@@ -999,3 +999,93 @@
 
 0x04,0x44,0x88,0xcc,0x00,0x05,0x12,0x9c
 # GFX1250: v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x88,0xcc,0x00,0x05,0x12,0x9c]
+
+0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+
+0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_reuse ; encoding: [0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+
+0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+
+0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+
+0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+
+0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+
+0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+
+0x00,0x68,0x35,0xcc,0x01,0x04,0x00,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s1, s2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x04,0x00,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
+
+0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+
+0x00,0x68,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
+
+0x00,0x08,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
+
+0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x40,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x40,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+
+0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x20,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x20,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+
+0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+
+0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+
+0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+
+0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_reuse ; encoding: [0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+
+0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+
+0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+
+0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+
+0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+
+0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+
+0x00,0x68,0x3a,0xcc,0x02,0x08,0x00,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s[2:3], s[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x08,0x00,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
+
+0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+
+0x00,0x68,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
+
+0x00,0x08,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
+
+0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x40,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x40,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+
+0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x20,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x20,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+
+0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+
+0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]