[llvm] 01c9a14 - AMDGPU: Define v_mfma_f32_{16x16x128|32x32x64}_f8f6f4 instructions (#116723)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 21 08:52:05 PST 2024
Author: Matt Arsenault
Date: 2024-11-21T08:51:58-08:00
New Revision: 01c9a14ccf98dba257bb36d9e9242b0bf5cdcaf2
URL: https://github.com/llvm/llvm-project/commit/01c9a14ccf98dba257bb36d9e9242b0bf5cdcaf2
DIFF: https://github.com/llvm/llvm-project/commit/01c9a14ccf98dba257bb36d9e9242b0bf5cdcaf2.diff
LOG: AMDGPU: Define v_mfma_f32_{16x16x128|32x32x64}_f8f6f4 instructions (#116723)
These use a new VOP3PX encoding for the v_mfma_scale_* instructions,
which bundles the pre-scale v_mfma_ld_scale_b32. None of the modifiers
are supported yet (op_sel, neg or clamp).
I'm not sure the intrinsic should really expose op_sel (or any of the
others). If I'm reading the documentation correctly, we should be able
to just have the raw scale operands and auto-match op_sel to byte
extract patterns.
The op_sel syntax also seems extra horrible in this usage, especially with the
usual assumed op_sel_hi=-1 behavior.
Added:
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
llvm/test/MC/Disassembler/AMDGPU/gfx950_vop3px2.txt
Modified:
clang/include/clang/Basic/BuiltinsAMDGPU.def
clang/lib/CodeGen/CGBuiltin.cpp
clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
llvm/docs/AMDGPUUsage.rst
llvm/include/llvm/IR/IntrinsicsAMDGPU.td
llvm/lib/Target/AMDGPU/AMDGPUGISel.td
llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
llvm/lib/Target/AMDGPU/SIDefines.h
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/lib/Target/AMDGPU/SIInstrFormats.td
llvm/lib/Target/AMDGPU/SIInstrInfo.h
llvm/lib/Target/AMDGPU/SIInstrInfo.td
llvm/lib/Target/AMDGPU/SIRegisterInfo.td
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
llvm/lib/Target/AMDGPU/VOP3PInstructions.td
llvm/lib/Target/AMDGPU/VOPInstructions.td
llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
llvm/test/MC/AMDGPU/mai-gfx950.s
llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt
llvm/test/tools/llvm-mca/AMDGPU/gfx950.s
Removed:
################################################################################
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 7ce8f2c1669d67..faf2e861451790 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -434,9 +434,11 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32, "ifiiIi", "nc", "fp8-conversion-
//===----------------------------------------------------------------------===//
// GFX950 only builtins.
//===----------------------------------------------------------------------===//
+TARGET_BUILTIN(__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4, "V4fV8ZiV8ZiV4fIiIiIiiIii", "nc", "gfx950-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4, "V16fV8ZiV8ZiV16fIiIiIiiIii", "nc", "gfx950-insts")
+
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x32_f16, "V4fV8hV8hV4fIiIiIi", "nc", "gfx950-insts")
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x16_f16, "V16fV8hV8hV16fIiIiIi", "nc", "gfx950-insts")
-
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x16_bf16, "V16fV8yV8yV16fIiIiIi", "nc", "gfx950-insts")
//===----------------------------------------------------------------------===//
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 887e0f0e05469e..b09c21a0cfcb50 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -19729,7 +19729,20 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
(uint64_t)0);
return Builder.CreateInsertElement(I0, A, 1);
}
-
+ case AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
+ case AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4: {
+ llvm::FixedVectorType *VT = FixedVectorType::get(Builder.getInt32Ty(), 8);
+ Function *F = CGM.getIntrinsic(
+ BuiltinID == AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4
+ ? Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4
+ : Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4,
+ {VT, VT});
+
+ SmallVector<Value *, 9> Args;
+ for (unsigned I = 0, N = E->getNumArgs(); I != N; ++I)
+ Args.push_back(EmitScalarExpr(E->getArg(I)));
+ return Builder.CreateCall(F, Args);
+ }
case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32:
case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
index 841d8fcad0fee0..ea9bdcdc211623 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
@@ -16,6 +16,7 @@ typedef half v16h __attribute__((ext_vector_type(16)));
typedef half v32h __attribute__((ext_vector_type(32)));
typedef int v2i __attribute__((ext_vector_type(2)));
typedef int v4i __attribute__((ext_vector_type(4)));
+typedef int v8i __attribute__((ext_vector_type(8)));
typedef int v16i __attribute__((ext_vector_type(16)));
typedef int v32i __attribute__((ext_vector_type(32)));
typedef short v2s __attribute__((ext_vector_type(2)));
@@ -431,4 +432,18 @@ v16f test_mfma_f32_32x32x16_bf16(v8bf16 a, v8bf16 b, v16f c) {
return __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, 1, 2, 3);
}
+// CHECK-GFX950-LABEL: @test_mfma_scale_f32_16x16x128_f8f6f4
+// CHECK-GFX950: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %a, <8 x i32> %b, <4 x float> %c, i32 3, i32 1, i32 2, i32 %scale_a, i32 3, i32 %scale_b)
+void test_mfma_scale_f32_16x16x128_f8f6f4(global v4f* out, v8i a, v8i b, v4f c, int scale_a, int scale_b)
+{
+ *out = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a, b, c, 3, 1, 2, scale_a, 3, scale_b);
+}
+
+// CHECK-GFX950-LABEL: @test_mfma_scale_f32_32x32x64_f8f6f4
+// CHECK-GFX950: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %a, <8 x i32> %b, <16 x float> %c, i32 3, i32 1, i32 2, i32 %scale_a, i32 3, i32 %scale_b)
+void test_mfma_scale_f32_32x32x64_f8f6f4(global v16f* out, v8i a, v8i b, v16f c, int scale_a, int scale_b)
+{
+ *out = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, b, c, 3, 1, 2, scale_a, 3, scale_b);
+}
+
#endif
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
index 4af67763c40dd2..7f0300ec196e34 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
@@ -5,6 +5,7 @@ typedef float float4 __attribute__((ext_vector_type(4)));
typedef float float16 __attribute__((ext_vector_type(16)));
typedef half half8 __attribute__((ext_vector_type(8)));
typedef __bf16 bfloat8 __attribute__((ext_vector_type(8)));
+typedef int int8 __attribute__((ext_vector_type(8)));
void test_mfma_f32_16x16x32_f16(__global float4* out, half8 a, half8 b, float4 c, int X) {
@@ -26,3 +27,17 @@ void test_mfma_f32_32x32x16_bf16(__global float16* out, bfloat8 a, bfloat8 b, fl
*out = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, 0, X, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_bf16' must be a constant integer}}
*out = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, 0, 0, X); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_bf16' must be a constant integer}}
}
+
+void test_mfma_scale_f32_16x16x128_f8f6f4(__global float4* out, int8 a, int8 b, float4 c, int X, int Y) {
+ *out = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a, b, c, X, 0, 1, Y, 2, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' must be a constant integer}}
+ *out = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a, b, c, 0, X, 1, Y, 2, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' must be a constant integer}}
+ *out = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a, b, c, 0, 0, X, Y, 2, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' must be a constant integer}}
+ *out = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a, b, c, 0, 0, 0, Y, X, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' must be a constant integer}}
+}
+
+void test_mfma_scale_f32_32x32x64_f8f6f4(__global float16* out, int8 a, int8 b, float16 c, int X, int Y) {
+ *out = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, b, c, X, 0, 1, Y, 2, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' must be a constant integer}}
+ *out = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, b, c, 0, X, 1, Y, 2, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' must be a constant integer}}
+ *out = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, b, c, 0, 0, X, Y, 2, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' must be a constant integer}}
+ *out = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, b, c, 0, 0, 0, Y, X, Y); // expected-error{{argument to '__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' must be a constant integer}}
+}
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
index e0fd2aa5c58a02..2a90bda1a5b16c 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
@@ -4,12 +4,33 @@
typedef float float4 __attribute__((ext_vector_type(4)));
typedef float float16 __attribute__((ext_vector_type(16)));
typedef half half8 __attribute__((ext_vector_type(8)));
+typedef half half16 __attribute__((ext_vector_type(16)));
typedef __bf16 bfloat8 __attribute__((ext_vector_type(8)));
+typedef __bf16 bfloat16 __attribute__((ext_vector_type(16)));
+typedef unsigned int uint2 __attribute__((ext_vector_type(2)));
+typedef int int4 __attribute__((ext_vector_type(4)));
+typedef int int8 __attribute__((ext_vector_type(8)));
+typedef int int16 __attribute__((ext_vector_type(16)));
void test(__global float4* out0, half8 a0, half8 b0, float4 c0,
__global float16* out1, half8 a1, half8 b1, float16 c1,
- __global float16* out2, bfloat8 a2, bfloat8 b2, float16 c2) {
+ __global float16* out2, bfloat8 a2, bfloat8 b2, float16 c2,
+ __global int4* out3, int4 a3, int4 b3, int4 c3,
+ __global int16* out4, int4 a4, int4 b4, int16 c4,
+ __global float4* out5, bfloat8 a5, bfloat8 b5, float4 c5,
+ __global float4* out6, half8 a6, half16 b6, float4 c6,
+ __global float16* out7, half8 a7, half16 b7, float16 c7,
+ __global float4* out8, bfloat8 a8, bfloat16 b8, float4 c8,
+ __global float16* out9, bfloat8 a9, bfloat16 b9, float16 c9,
+ __global int4* out10, int4 a10, int8 b10, int4 c10,
+ __global int16* out11, int4 a11, int8 b11, int16 c11,
+ __global float4* out12, int4 a12, int8 b12, float4 c12,
+ __global float16* out13, int4 a13, int8 b13, float16 c13,
+ __global float4* out14, int8 a14, int8 b14, float4 c14, int d14, int e14,
+ __global float16* out15, int8 a15, int8 b15, float16 c15, int d15, int e15) {
*out0 = __builtin_amdgcn_mfma_f32_16x16x32_f16(a0, b0, c0, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_16x16x32_f16' needs target feature gfx950-insts}}
*out1 = __builtin_amdgcn_mfma_f32_32x32x16_f16(a1, b1, c1, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_32x32x16_f16' needs target feature gfx950-insts}}
*out2 = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a2, b2, c2, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_32x32x16_bf16' needs target feature gfx950-insts}}
+ *out14 = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a14, b14, c14, 0, 0, 0, d14, 0, e14); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' needs target feature gfx950-insts}}
+ *out15 = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a15, b15, c15, 0, 0, 0, d15, 0, e15); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' needs target feature gfx950-insts}}
}
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index a25b6feddbeddc..161363e0dd6bcc 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1397,6 +1397,16 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
used by hardware to control active lanes when used in EXEC register.
For example, ballot(i1 true) return EXEC mask.
+ llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4 Emit `v_mfma_scale_f32_16x16x128_f8f6f4` to set the scale factor. The
+ last 4 operands correspond to the scale inputs.
+
+ - 2-bit byte index to use for each lane for matrix A
+ - Matrix A scale values
+ - 2-bit byte index to use for each lane for matrix B
+ - Matrix B scale values
+
+ llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4 Emit `v_mfma_scale_f32_32x32x64_f8f6f4`
+
============================================== ==========================================================
.. TODO::
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 360af786c5160d..3a5fc86183ca0e 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2968,6 +2968,35 @@ class AMDGPUMfmaIntrinsic<LLVMType DestTy, LLVMType SrcABTy> :
[IntrConvergent, IntrNoMem,
ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
+
+// srcA's format is determined by cbsz. srcB's format is determined by
+// blgp.
+//
+// These should be <8 x i32> for f8 formats, <6 x i32> for f6 formats,
+// and <4 x i32> for f4 formats. If the format control bits imply a
+// smaller type than used, the high elements will be truncated.
+//
+// If the format control bits imply a larger type than used, the high
+// elements are padded with undef.
+
+class AMDGPUMfmaScaleIntrinsic<LLVMType DestTy> :
+ DefaultAttrsIntrinsic<[DestTy],
+ [llvm_anyvector_ty, llvm_anyvector_ty, DestTy,
+ llvm_i32_ty, // cbsz
+ llvm_i32_ty, // blgp
+ // llvm_i1_ty, // TODO: neg_src2
+ // llvm_i1_ty, // TODO: abs_src2
+ // llvm_i1_ty, // TODO: clamp
+ llvm_i32_ty, // op_sel (A matrix scale, 2-bits) // TODO: Make i2?
+ llvm_i32_ty, // v_mfma_ld_scale_b32 src0 (A matrix scale)
+ llvm_i32_ty, // op_sel (B matrix scale, 2-bits) // TODO: Make i2?
+ llvm_i32_ty // v_mfma_ld_scale_b32 src1 (B matrix scale)
+ ],
+ [IntrConvergent, IntrNoMem,
+ ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>,
+ ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<7>>
+ ]>;
+
defset list<Intrinsic> AMDGPUMFMAIntrinsics908 = {
def int_amdgcn_mfma_f32_32x32x1f32 : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_float_ty>;
def int_amdgcn_mfma_f32_16x16x1f32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_float_ty>;
@@ -3119,6 +3148,8 @@ def int_amdgcn_mfma_f32_16x16x32_f16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v
def int_amdgcn_mfma_f32_32x32x16_f16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v8f16_ty>;
def int_amdgcn_mfma_f32_32x32x16_bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v8bf16_ty>;
+def int_amdgcn_mfma_scale_f32_16x16x128_f8f6f4 : AMDGPUMfmaScaleIntrinsic<llvm_v4f32_ty>;
+def int_amdgcn_mfma_scale_f32_32x32x64_f8f6f4 : AMDGPUMfmaScaleIntrinsic<llvm_v16f32_ty>;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index d348f489d95dd3..88fa96bd049f29 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -423,3 +423,6 @@ def gi_fp_pow2_to_exponent : GICustomOperandRenderer<"renderFPPow2ToExponent">,
def gi_as_hw_round_mode : GICustomOperandRenderer<"renderRoundMode">,
GISDNodeXFormEquiv<as_hw_round_mode>;
+
+def gi_MFMALdScaleModifierOp : GICustomOperandRenderer<"renderScaledMAIIntrinsicOperand">,
+ GISDNodeXFormEquiv<MFMALdScaleXForm>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 28d215e7b3de9f..30b9e3ea11ef4b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1258,6 +1258,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
if (isa<UndefValue>(Src)) {
return IC.replaceInstUsesWith(II, Src);
}
+ return std::nullopt;
}
}
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index f2fe3befd04d55..f7cba2f13c77ac 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -5742,6 +5742,18 @@ void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4);
}
+/// Render the 2-bit op_sel immediate found at operand \p OpIdx of \p MI as
+/// the matching combination of SISrcMods::OP_SEL_0 / SISrcMods::OP_SEL_1
+/// flags, appended to \p MIB as an immediate operand.
+void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand(
+    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
+  const unsigned Sel = MI.getOperand(OpIdx).getImm();
+  // Bit 0 of the input maps to OP_SEL_0, bit 1 maps to OP_SEL_1.
+  const unsigned LoFlag = (Sel & 0x1) ? SISrcMods::OP_SEL_0 : 0;
+  const unsigned HiFlag = (Sel & 0x2) ? SISrcMods::OP_SEL_1 : 0;
+  MIB.addImm(LoFlag | HiFlag);
+}
+
bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
return TII.isInlineConstant(Imm);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 42343104812b66..563e40267f04b1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -364,6 +364,8 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
void renderRoundMode(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx) const;
+ void renderScaledMAIIntrinsicOperand(MachineInstrBuilder &MIB,
+ const MachineInstr &MI, int OpIdx) const;
bool isInlineImmediate(const APInt &Imm) const;
bool isInlineImmediate(const APFloat &Imm) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 671070c70f0c41..6a5065cd4a0e8f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -40,7 +40,7 @@ class AMDGPUInst <dag outs, dag ins, string asm = "",
// instructions to not match without killing the whole decode process. It is
// mainly used for ARM, but Tablegen expects this field to exist or it fails
// to build the decode table.
- field bits<96> SoftFail = 0;
+ field bits<128> SoftFail = 0; // FIXME: If this is smaller than largest instruction, DecodeEmitter crashes
let DecoderNamespace = Namespace;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index b648b68f3bd2b0..7aae9194b5cd02 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4769,6 +4769,25 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
: getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
break;
}
+ case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
+ case Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4: {
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ OpdsMapping[0] =
+ Info->mayNeedAGPRs()
+ ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
+ : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
+
+ OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+ OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
+ OpdsMapping[4] =
+ Info->mayNeedAGPRs()
+ ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
+ : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
+
+ OpdsMapping[8] = getVGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI);
+ OpdsMapping[10] = getVGPROpMapping(MI.getOperand(10).getReg(), MRI, *TRI);
+ break;
+ }
case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 06df08feda8fa4..f90121a86c846c 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -512,6 +512,17 @@ static inline DecoderUInt128 eat12Bytes(ArrayRef<uint8_t> &Bytes) {
return DecoderUInt128(Lo, Hi);
}
+/// Consume the first 16 bytes of \p Bytes and return them as a 128-bit
+/// decoder word. Both 64-bit halves are read little-endian; the first eight
+/// bytes form the low half. \p Bytes is advanced past the consumed data.
+static inline DecoderUInt128 eat16Bytes(ArrayRef<uint8_t> &Bytes) {
+  assert(Bytes.size() >= 16);
+  const uint8_t *Data = Bytes.data();
+  const uint64_t Lo =
+      support::endian::read<uint64_t, llvm::endianness::little>(Data);
+  const uint64_t Hi =
+      support::endian::read<uint64_t, llvm::endianness::little>(Data + 8);
+  Bytes = Bytes.slice(16);
+  return DecoderUInt128(Lo, Hi);
+}
+
DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
ArrayRef<uint8_t> Bytes_,
uint64_t Address,
@@ -548,6 +559,15 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
// Reinitialize Bytes
Bytes = Bytes_.slice(0, MaxInstBytesNum);
+
+ } else if (Bytes.size() >= 16 &&
+ STI.hasFeature(AMDGPU::FeatureGFX950Insts)) {
+ DecoderUInt128 DecW = eat16Bytes(Bytes);
+ if (tryDecodeInst(DecoderTableGFX940128, MI, DecW, Address, CS))
+ break;
+
+ // Reinitialize Bytes
+ Bytes = Bytes_.slice(0, MaxInstBytesNum);
}
if (Bytes.size() >= 8) {
@@ -759,6 +779,9 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::SDWA)
convertSDWAInst(MI);
+ if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::IsMAI)
+ convertMAIInst(MI);
+
int VDstIn_Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
AMDGPU::OpName::vdst_in);
if (VDstIn_Idx != -1) {
@@ -837,6 +860,58 @@ void AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const {
}
}
+/// Narrow the register held by \p MO to the sub-register tuple covering its
+/// first \p NumRegs 32-bit registers, as needed by the MFMA f8f6f4 operand
+/// formats. Only 4, 6 and 8 are valid values for \p NumRegs.
+static void adjustMFMA_F8F6F4OpRegClass(const MCRegisterInfo &MRI,
+                                        MCOperand &MO, uint8_t NumRegs) {
+  // No-op in cases where one operand is still f8/bf8.
+  if (NumRegs == 8)
+    return;
+
+  unsigned SubRegIdx;
+  switch (NumRegs) {
+  case 4:
+    SubRegIdx = AMDGPU::sub0_sub1_sub2_sub3;
+    break;
+  case 6:
+    SubRegIdx = AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5;
+    break;
+  default:
+    llvm_unreachable("Unexpected size for mfma f8f6f4 operand");
+  }
+
+  MO.setReg(MRI.getSubReg(MO.getReg(), SubRegIdx));
+}
+
+/// f8f6f4 instructions have different pseudos depending on the used formats. In
+/// the disassembler table, we only have the variants with the largest register
+/// classes which assume using an fp8/bf8 format for both operands. The actual
+/// register class depends on the format in blgp and cbsz operands. Adjust the
+/// register classes depending on the used format.
+void AMDGPUDisassembler::convertMAIInst(MCInst &MI) const {
+  const unsigned DecodedOpc = MI.getOpcode();
+
+  // Instructions without a blgp operand are left untouched.
+  const int BlgpOpIdx =
+      AMDGPU::getNamedOperandIdx(DecodedOpc, AMDGPU::OpName::blgp);
+  if (BlgpOpIdx == -1)
+    return;
+
+  const int CbszOpIdx =
+      AMDGPU::getNamedOperandIdx(DecodedOpc, AMDGPU::OpName::cbsz);
+
+  // The cbsz/blgp immediates are the lookup keys for the variant to use.
+  const unsigned CBSZ = MI.getOperand(CbszOpIdx).getImm();
+  const unsigned BLGP = MI.getOperand(BlgpOpIdx).getImm();
+
+  const AMDGPU::MFMA_F8F6F4_Info *FormatInfo =
+      AMDGPU::getMFMA_F8F6F4_WithFormatArgs(CBSZ, BLGP, DecodedOpc);
+
+  // Nothing to do when there is no table entry or the decoded opcode is
+  // already the right one.
+  if (!FormatInfo || FormatInfo->Opcode == DecodedOpc)
+    return;
+
+  MI.setOpcode(FormatInfo->Opcode);
+
+  // Look the source operands up on the new opcode, then shrink each register
+  // to the width recorded in the table entry.
+  const int SrcAOpIdx =
+      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
+  const int SrcBOpIdx =
+      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1);
+  adjustMFMA_F8F6F4OpRegClass(MRI, MI.getOperand(SrcAOpIdx),
+                              FormatInfo->NumRegsSrcA);
+  adjustMFMA_F8F6F4OpRegClass(MRI, MI.getOperand(SrcBOpIdx),
+                              FormatInfo->NumRegsSrcB);
+}
+
struct VOPModifiers {
unsigned OpSel = 0;
unsigned OpSelHi = 0;
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 694cd7a9bfd282..3e20a2ab9e66ca 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -204,6 +204,7 @@ class AMDGPUDisassembler : public MCDisassembler {
void convertVINTERPInst(MCInst &MI) const;
void convertFMAanyK(MCInst &MI, int ImmLitIdx) const;
void convertSDWAInst(MCInst &MI) const;
+ void convertMAIInst(MCInst &MI) const;
void convertDPP8Inst(MCInst &MI) const;
void convertMIMGInst(MCInst &MI) const;
void convertVOP3DPPInst(MCInst &MI) const;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
index 3c9f6d2938075b..56ed29ede02c23 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
@@ -59,6 +59,10 @@ unsigned AMDGPUMCAsmInfo::getMaxInstLength(const MCSubtargetInfo *STI) const {
if (STI->hasFeature(AMDGPU::FeatureNSAEncoding))
return 20;
+ // VOP3PX encoding.
+ if (STI->hasFeature(AMDGPU::FeatureGFX950Insts))
+ return 16;
+
// 64-bit instruction with 32-bit literal.
if (STI->hasFeature(AMDGPU::FeatureVOP3Literal))
return 12;
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 8f297726a0df88..f812ae652b63d0 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -1048,6 +1048,18 @@ enum Offset_COV5 : unsigned {
} // namespace ImplicitArg
+namespace MFMAScaleFormats {
+// Enum value used in cbsz/blgp for F8F6F4 MFMA operations to select the matrix
+// format.
+enum MFMAScaleFormats {
+ FP8_E4M3 = 0,
+ FP8_E5M2 = 1,
+ FP6_E2M3 = 2,
+ FP6_E3M2 = 3,
+ FP4_E2M1 = 4
+};
+} // namespace MFMAScaleFormats
+
namespace VirtRegFlag {
// Virtual register flags used for various target specific handlings during
// codegen.
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 8c8d55bbc01316..1406938592b2cb 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -15454,6 +15454,23 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
MRI.setRegClass(Op.getReg(), NewRC);
}
+  if (TII->isMAI(MI)) {
+    // The ordinary src0, src1, src2 were legalized above.
+    //
+    // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
+    // as a separate instruction.
+    int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+                                             AMDGPU::OpName::scale_src0);
+    if (Src0Idx != -1) {
+      // scale_src1 is expected to sit two operands after scale_src0. Verify
+      // that with a comparison: an assignment here would overwrite Src1Idx
+      // and make the check meaningless.
+      int Src1Idx = Src0Idx + 2;
+      assert(Src1Idx == AMDGPU::getNamedOperandIdx(
+                            MI.getOpcode(), AMDGPU::OpName::scale_src1));
+      // If both scale operands use the constant bus, legalize scale_src1
+      // with a move.
+      if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
+          TII->usesConstantBus(MRI, MI, Src1Idx))
+        TII->legalizeOpWithMove(MI, Src1Idx);
+    }
+  }
+
if (!HasAGPRs)
return;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
index dd1ab2c628715d..267c9a94b90968 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
@@ -300,6 +300,11 @@ class Enc96 {
int Size = 12;
}
+class Enc128 {
+ field bits<128> Inst;
+ int Size = 16;
+}
+
def CPolBit {
int GLC = 0;
int SLC = 1;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 1f7fff76d15210..e55418326a4bd0 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1115,6 +1115,12 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
const MachineOperand &MO,
const MCOperandInfo &OpInfo) const;
+  /// Convenience overload: query constant bus usage for operand \p OpIdx of
+  /// \p MI, pairing the operand with its entry in the instruction's
+  /// descriptor.
+  bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineInstr &MI,
+                       int OpIdx) const {
+    const MachineOperand &MO = MI.getOperand(OpIdx);
+    const MCOperandInfo &OpInfo = MI.getDesc().operands()[OpIdx];
+    return usesConstantBus(MRI, MO, OpInfo);
+  }
+
/// Return true if this instruction has any modifiers.
/// e.g. src[012]_mod, omod, clamp.
bool hasModifiers(unsigned Opcode) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index d2024cf915874d..a6496cd4a61f19 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -914,6 +914,16 @@ def fp16_zeros_high_16bits : PatLeaf<(f16 VGPR_32:$src), [{
return fp16SrcZerosHighBits(N->getOpcode());
}]>;
+def MFMALdScaleXForm : SDNodeXForm<timm, [{
+ unsigned Val = N->getZExtValue();
+ unsigned New = 0;
+ if (Val & 0x1)
+ New |= SISrcMods::OP_SEL_0;
+ if (Val & 0x2)
+ New |= SISrcMods::OP_SEL_1;
+ return CurDAG->getTargetConstant(New, SDLoc(N), MVT::i32);
+}]>;
+
def is_canonicalized : PatLeaf<(fAny srcvalue:$src), [{
const SITargetLowering &Lowering =
*static_cast<const SITargetLowering *>(getTargetLowering());
@@ -1515,6 +1525,10 @@ class PackedIntInputMods <PackedIntInputModsMatchClass matchClass> : InputMods <
def PackedF16InputMods : PackedFPInputMods<PackedF16InputModsMatchClass>;
def PackedI16InputMods : PackedIntInputMods<PackedI16InputModsMatchClass>;
+def MFMALdScaleModifierOp : TImmLeaf<i32, [{
+ return isUInt<2>(Imm);
+}], MFMALdScaleXForm>;
+
//===----------------------------------------------------------------------===//
// Complex patterns
//===----------------------------------------------------------------------===//
@@ -2655,6 +2669,8 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
field string AsmVOPDX = getAsmVOPDPart<NumSrcArgs, "X">.ret;
field string AsmVOPDY = getAsmVOPDPart<NumSrcArgs, "Y">.ret;
field string TieRegDPP = "$old";
+ field bit IsSMFMAC = false;
+ field bit HasAbid = !and(IsMAI, HasSrc1);
}
class VOP_NO_EXT <VOPProfile p> : VOPProfile <p.ArgVT> {
@@ -2851,6 +2867,31 @@ def VOP_V16F32_V2I32_V4I32_I32 : VOPProfile <[v16f32, v2i32, v4i32, i32]>;
def VOP_V4F32_V8F16_V8F16_V4F32 : VOPProfile <[v4f32, v8f16, v8f16, v4f32]>;
def VOP_V16F32_V8F16_V8F16_V16F32 : VOPProfile <[v16f32, v8f16, v8f16, v16f32]>;
def VOP_V16F32_V8BF16_V8BF16_V16F32 : VOPProfile <[v16f32, v8bf16, v8bf16, v16f32]>;
+def VOP_V4F32_V8I32_V8I32_V4F32 : VOPProfile <[v4f32, v8i32, v8i32, v4f32]>;
+
+def VOP_V4F32_V8I32_V6I32_V4F32 : VOPProfile <[v4f32, v8i32, v6i32, v4f32]>;
+def VOP_V4F32_V6I32_V8I32_V4F32 : VOPProfile <[v4f32, v6i32, v8i32, v4f32]>;
+def VOP_V4F32_V6I32_V6I32_V4F32 : VOPProfile <[v4f32, v6i32, v6i32, v4f32]>;
+
+def VOP_V4F32_V8I32_V4I32_V4F32 : VOPProfile <[v4f32, v8i32, v4i32, v4f32]>;
+def VOP_V4F32_V4I32_V8I32_V4F32 : VOPProfile <[v4f32, v4i32, v8i32, v4f32]>;
+def VOP_V4F32_V6I32_V4I32_V4F32 : VOPProfile <[v4f32, v6i32, v4i32, v4f32]>;
+def VOP_V4F32_V4I32_V6I32_V4F32 : VOPProfile <[v4f32, v4i32, v6i32, v4f32]>;
+def VOP_V4F32_V4I32_V4I32_V4F32 : VOPProfile <[v4f32, v4i32, v4i32, v4f32]>;
+
+def VOP_V16F32_V8I32_V8I32_V16F32 : VOPProfile <[v16f32, v8i32, v8i32, v16f32]>;
+def VOP_V16F32_V8I32_V6I32_V16F32 : VOPProfile <[v16f32, v8i32, v6i32, v16f32]>;
+def VOP_V16F32_V6I32_V8I32_V16F32 : VOPProfile <[v16f32, v6i32, v8i32, v16f32]>;
+def VOP_V16F32_V6I32_V6I32_V16F32 : VOPProfile <[v16f32, v6i32, v6i32, v16f32]>;
+
+def VOP_V16F32_V8I32_V4I32_V16F32 : VOPProfile <[v16f32, v8i32, v4i32, v16f32]>;
+def VOP_V16F32_V4I32_V8I32_V16F32 : VOPProfile <[v16f32, v4i32, v8i32, v16f32]>;
+def VOP_V16F32_V6I32_V4I32_V16F32 : VOPProfile <[v16f32, v6i32, v4i32, v16f32]>;
+def VOP_V16F32_V4I32_V6I32_V16F32 : VOPProfile <[v16f32, v4i32, v6i32, v16f32]>;
+def VOP_V16F32_V4I32_V4I32_V16F32 : VOPProfile <[v16f32, v4i32, v4i32, v16f32]>;
+
+def VOP_V4I32_V4I32_V4I32_V4I32 : VOPProfile <[v4i32, v4i32, v4i32, v4i32]>;
+def VOP_V16I32_V4I32_V4I32_V16I32 : VOPProfile <[v16i32, v4i32, v4i32, v16i32]>;
class Commutable_REV <string revOp, bit isOrig> {
@@ -3114,6 +3155,16 @@ def getVCMPXOpFromVCMP : InstrMapping {
let ValueCols = [["1"]];
}
+// Searchable table over records deriving from MFMA_F8F6F4_WithSizeTable. It maps
+// (NumRegsSrcA, NumRegsSrcB, F8F8Opcode) to the mfma(_scale)?_f8f6f4 variant stored in Opcode.
+def getMFMA_F8F6F4_WithSize : GenericTable {
+ let FilterClass = "MFMA_F8F6F4_WithSizeTable";
+ let CppTypeName = "MFMA_F8F6F4_Info"; // Row struct declared in AMDGPUBaseInfo.h.
+ let Fields = [ "Opcode", "F8F8Opcode", "NumRegsSrcA", "NumRegsSrcB" ];
+ let PrimaryKey = [ "NumRegsSrcA", "NumRegsSrcB", "F8F8Opcode" ];
+ let PrimaryKeyName = "getMFMA_F8F6F4_InstWithNumRegs" ; // Lookup function called from AMDGPUBaseInfo.cpp.
+}
+
def FP8DstByteSelTable : GenericTable {
let FilterClass = "VOP3_Pseudo";
let CppTypeName = "FP8DstByteSelInfo";
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 46b2b4a389200a..e3baeed01841ab 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -1346,11 +1346,14 @@ class AVSrcOperand<RegisterClass regClass, string width>
def AVSrc_32 : AVSrcOperand<AV_32, "OPW32">;
def AVSrc_64 : AVSrcOperand<AV_64, "OPW64">;
def AVSrc_128 : AVSrcOperand<AV_128, "OPW128">;
+def AVSrc_192 : AVSrcOperand<AV_192, "OPW192">; // Source operand used for the v6i32 (f6) MFMA inputs.
+def AVSrc_256 : AVSrcOperand<AV_256, "OPW256">; // Source operand used for the v8i32 (f8) MFMA inputs.
class AVDstOperand<RegisterClass regClass, string width>
: AVOperand<regClass, "decodeAV10", width>; // Forwards to AVOperand with "decodeAV10" as the second argument.
def AVDst_128 : AVDstOperand<AV_128, "OPW128">;
+def AVDst_256 : AVDstOperand<AV_256, "OPW256">; // AV_256 / OPW256 counterpart of the 128 and 512 variants.
def AVDst_512 : AVDstOperand<AV_512, "OPW512">;
class AVLdStOperand<RegisterClass regClass, string width>
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 501d00b1f308d9..c7e4659b15d299 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -415,6 +415,8 @@ struct FP8DstByteSelInfo {
#define GET_WMMAOpcode2AddrMappingTable_IMPL
#define GET_WMMAOpcode3AddrMappingTable_DECL
#define GET_WMMAOpcode3AddrMappingTable_IMPL
+#define GET_getMFMA_F8F6F4_WithSize_DECL // Request the getMFMA_F8F6F4_WithSize declarations from the include below.
+#define GET_getMFMA_F8F6F4_WithSize_IMPL // Request the matching implementation as well.
#include "AMDGPUGenSearchableTables.inc"
int getMTBUFBaseOpcode(unsigned Opc) {
@@ -523,6 +525,30 @@ bool getMAIIsGFX940XDL(unsigned Opc) {
return Info ? Info->is_gfx940_xdl : false;
}
+static uint8_t mfmaScaleF8F6F4FormatToNumRegs(unsigned EncodingVal) { // Maps an MFMAScaleFormats value to a source register count.
+ switch (EncodingVal) {
+ case MFMAScaleFormats::FP6_E2M3:
+ case MFMAScaleFormats::FP6_E3M2:
+ return 6;
+ case MFMAScaleFormats::FP4_E2M1:
+ return 4;
+ case MFMAScaleFormats::FP8_E4M3:
+ case MFMAScaleFormats::FP8_E5M2:
+ default: // Any value not listed above is sized like FP8.
+ return 8;
+ }
+// NOTE(review): the default label makes every switch path return, so the statement below is never reached.
+ llvm_unreachable("covered switch over mfma scale formats");
+}
+// Returns the table entry for F8F8Opcode's variant whose source register counts match the CBSZ and BLGP formats.
+const MFMA_F8F6F4_Info *getMFMA_F8F6F4_WithFormatArgs(unsigned CBSZ,
+ unsigned BLGP,
+ unsigned F8F8Opcode) {
+ uint8_t SrcANumRegs = mfmaScaleF8F6F4FormatToNumRegs(CBSZ); // CBSZ sizes source A.
+ uint8_t SrcBNumRegs = mfmaScaleF8F6F4FormatToNumRegs(BLGP); // BLGP sizes source B.
+ return getMFMA_F8F6F4_InstWithNumRegs(SrcANumRegs, SrcBNumRegs, F8F8Opcode);
+}
+
unsigned getVOPDEncodingFamily(const MCSubtargetInfo &ST) {
if (ST.hasFeature(AMDGPU::FeatureGFX12Insts))
return SIEncodingFamily::GFX12;
@@ -2912,6 +2938,7 @@ const AlwaysUniform *lookupAlwaysUniform(unsigned Intr);
#define GET_Gfx9BufferFormat_IMPL
#define GET_Gfx10BufferFormat_IMPL
#define GET_Gfx11PlusBufferFormat_IMPL
+
#include "AMDGPUGenSearchableTables.inc"
} // end anonymous namespace
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 88a6d75b72c7d0..b0581711961b13 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -581,6 +581,18 @@ unsigned getVOPDEncodingFamily(const MCSubtargetInfo &ST);
LLVM_READONLY
CanBeVOPD getCanBeVOPD(unsigned Opc);
+struct MFMA_F8F6F4_Info { // Row type of the getMFMA_F8F6F4_WithSize searchable table.
+ unsigned Opcode; // Variant matching the NumRegsSrcA/NumRegsSrcB pair.
+ unsigned F8F8Opcode; // The f8 x f8 variant this row is keyed on.
+ uint8_t NumRegsSrcA; // Source A size, counted in 32-bit units.
+ uint8_t NumRegsSrcB; // Source B size, counted in 32-bit units.
+};
+// Table lookup by format: CBSZ and BLGP are the source A and source B format values.
+LLVM_READONLY
+const MFMA_F8F6F4_Info *getMFMA_F8F6F4_WithFormatArgs(unsigned CBSZ,
+ unsigned BLGP,
+ unsigned F8F8Opcode);
+
LLVM_READONLY
const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t BitsPerComp,
uint8_t NumComponents,
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 876d4e1acf5964..a6ec1dba23aad2 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -18,6 +18,7 @@ class VOP3P_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR,
// overloaded in meaning and the logic in printOperandAndFPInputMods is
// wrong for vop3p
let AsmVOP3Base = AsmVOP3P;
+ bit IsSMFMAC = false;
}
def VOP_MFMA_LD_SCALE : VOP3P_Profile<VOPProfile<[untyped, i32, i32, untyped]>, VOP3P_LD_SCALE> {
@@ -542,19 +543,23 @@ def VOPProfileAccWrite : VOP3P_Profile<VOP_I32_I32, VOP3_MAI> {
}
class VOPProfileMAI<VOPProfile P, RegisterOperand _SrcRC, RegisterOperand _DstRC,
- RegisterOperand SrcABRC = AVSrc_32>
+ RegisterOperand SrcARC = AVSrc_32, RegisterOperand SrcBRC = SrcARC>
: VOP3P_Profile<P, VOP3_MAI> {
+ bit HasAbid = true;
let DstRC = _DstRC;
- let Src0RC64 = SrcABRC;
- let Src1RC64 = SrcABRC;
+ let Src0RC64 = SrcARC;
+ let Src1RC64 = SrcBRC;
let Src2RC64 = _SrcRC;
let HasOpSel = 0;
let HasClamp = 0;
let HasIntClamp = 0;
let HasOMod = 0;
let HasModifiers = 0;
- let AsmVOP3Base = "$vdst, $src0, $src1, $src2$cbsz$abid$blgp";
- let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, Src2RC64:$src2, CBSZ:$cbsz, ABID:$abid, blgp:$blgp);
+ let AsmVOP3Base = "$vdst, $src0, $src1, $src2$cbsz"#!if(HasAbid,"$abid","")#"$blgp";
+ let Ins64 = !con(
+ (ins Src0RC64:$src0, Src1RC64:$src1, Src2RC64:$src2, CBSZ:$cbsz),
+ !if(HasAbid, (ins ABID:$abid), (ins)),
+ (ins blgp:$blgp));
let InsVOP3Base = Ins64;
// Dst and SrcC cannot partially overlap if SrcC/Dst is bigger than 4 VGPRs.
// We then create two versions of the instruction: with tied dst and src2
@@ -572,6 +577,7 @@ class VOPProfileSMFMAC<VOPProfile P, RegisterOperand _DstRC,
let Asm64 = " $vdst, $src0, $src1, $idx$cbsz$abid";
let Outs64 = (outs DstRC:$vdst);
let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, VRegSrc_32:$idx, CBSZ:$cbsz, ABID:$abid, Src2RC64:$src2);
+ let IsSMFMAC = true;
}
def VOPProfileMAI_F32_F32_X4 : VOPProfileMAI<VOP_V4F32_F32_F32_V4F32, AISrc_128_f32, ADst_128>;
@@ -639,14 +645,112 @@ def VOPProfileMAI_F32_V8F16_X16_VCD : VOPProfileMAI<VOP_V16F32_V8F16_V8F16_V16F3
def VOPProfileMAI_F32_V8BF16_X16 : VOPProfileMAI<VOP_V16F32_V8BF16_V8BF16_V16F32, AISrc_512_f32, ADst_512, AVSrc_128>;
def VOPProfileMAI_F32_V8BF16_X16_VCD : VOPProfileMAI<VOP_V16F32_V8BF16_V8BF16_V16F32, VISrc_512_f32, VDst_512, AVSrc_128>;
+
+let HasAbid = false in { // None of the f8f6f4 profiles below has an abid operand.
+// For f32_16x16x128_f8f6f4 - f8 x f8 case (SrcB class defaults to the SrcA class)
+def VOPProfileMAI_F32_V8I32_V8I32_X128 : VOPProfileMAI<VOP_V4F32_V8I32_V8I32_V4F32, AISrc_128_f32, ADst_128, AVSrc_256>;
+def VOPProfileMAI_F32_V8I32_V8I32_X128_VCD : VOPProfileMAI<VOP_V4F32_V8I32_V8I32_V4F32, VISrc_128_f32, VDst_128, AVSrc_256>;
+
+// For f32_16x16x128_f8f6f4 - f8 x f6 case
+def VOPProfileMAI_F32_V8I32_V6I32_X128 : VOPProfileMAI<VOP_V4F32_V8I32_V6I32_V4F32, AISrc_128_f32, ADst_128, AVSrc_256, AVSrc_192>;
+def VOPProfileMAI_F32_V8I32_V6I32_X128_VCD : VOPProfileMAI<VOP_V4F32_V8I32_V6I32_V4F32, VISrc_128_f32, VDst_128, AVSrc_256, AVSrc_192>;
+
+// For f32_16x16x128_f8f6f4 - f6 x f8 case
+def VOPProfileMAI_F32_V6I32_V8I32_X128 : VOPProfileMAI<VOP_V4F32_V6I32_V8I32_V4F32, AISrc_128_f32, ADst_128, AVSrc_192, AVSrc_256>;
+def VOPProfileMAI_F32_V6I32_V8I32_X128_VCD : VOPProfileMAI<VOP_V4F32_V6I32_V8I32_V4F32, VISrc_128_f32, VDst_128, AVSrc_192, AVSrc_256>;
+
+// For f32_16x16x128_f8f6f4 - f6 x f6 case
+def VOPProfileMAI_F32_V6I32_V6I32_X128 : VOPProfileMAI<VOP_V4F32_V6I32_V6I32_V4F32, AISrc_128_f32, ADst_128, AVSrc_192, AVSrc_192>;
+def VOPProfileMAI_F32_V6I32_V6I32_X128_VCD : VOPProfileMAI<VOP_V4F32_V6I32_V6I32_V4F32, VISrc_128_f32, VDst_128, AVSrc_192, AVSrc_192>;
+
+// For f32_16x16x128_f8f6f4 - f6 x f4 case
+def VOPProfileMAI_F32_V6I32_V4I32_X128 : VOPProfileMAI<VOP_V4F32_V6I32_V4I32_V4F32, AISrc_128_f32, ADst_128, AVSrc_192, AVSrc_128>;
+def VOPProfileMAI_F32_V6I32_V4I32_X128_VCD : VOPProfileMAI<VOP_V4F32_V6I32_V4I32_V4F32, VISrc_128_f32, VDst_128, AVSrc_192, AVSrc_128>;
+
+// For f32_16x16x128_f8f6f4 - f4 x f6 case
+def VOPProfileMAI_F32_V4I32_V6I32_X128 : VOPProfileMAI<VOP_V4F32_V4I32_V6I32_V4F32, AISrc_128_f32, ADst_128, AVSrc_128, AVSrc_192>;
+def VOPProfileMAI_F32_V4I32_V6I32_X128_VCD : VOPProfileMAI<VOP_V4F32_V4I32_V6I32_V4F32, VISrc_128_f32, VDst_128, AVSrc_128, AVSrc_192>;
+
+// For f32_16x16x128_f8f6f4 - f8 x f4 case
+def VOPProfileMAI_F32_V8I32_V4I32_X128 : VOPProfileMAI<VOP_V4F32_V8I32_V4I32_V4F32, AISrc_128_f32, ADst_128, AVSrc_256, AVSrc_128>;
+def VOPProfileMAI_F32_V8I32_V4I32_X128_VCD : VOPProfileMAI<VOP_V4F32_V8I32_V4I32_V4F32, VISrc_128_f32, VDst_128, AVSrc_256, AVSrc_128>;
+
+// For f32_16x16x128_f8f6f4 - f4 x f8 case
+def VOPProfileMAI_F32_V4I32_V8I32_X128 : VOPProfileMAI<VOP_V4F32_V4I32_V8I32_V4F32, AISrc_128_f32, ADst_128, AVSrc_128, AVSrc_256>;
+def VOPProfileMAI_F32_V4I32_V8I32_X128_VCD : VOPProfileMAI<VOP_V4F32_V4I32_V8I32_V4F32, VISrc_128_f32, VDst_128, AVSrc_128, AVSrc_256>;
+
+// For f32_16x16x128_f8f6f4 - f4 x f4 case
+def VOPProfileMAI_F32_V4I32_V4I32_X128 : VOPProfileMAI<VOP_V4F32_V4I32_V4I32_V4F32, AISrc_128_f32, ADst_128, AVSrc_128, AVSrc_128>;
+def VOPProfileMAI_F32_V4I32_V4I32_X128_VCD : VOPProfileMAI<VOP_V4F32_V4I32_V4I32_V4F32, VISrc_128_f32, VDst_128, AVSrc_128, AVSrc_128>;
+
+// For f32_32x32x64_f8f6f4 - f8 x f8 case (SrcB class defaults to the SrcA class)
+def VOPProfileMAI_F32_V8I32_V8I32_X512 : VOPProfileMAI<VOP_V16F32_V8I32_V8I32_V16F32, AISrc_512_f32, ADst_512, AVSrc_256>;
+def VOPProfileMAI_F32_V8I32_V8I32_X512_VCD : VOPProfileMAI<VOP_V16F32_V8I32_V8I32_V16F32, VISrc_512_f32, VDst_512, AVSrc_256>;
+
+// For f32_32x32x64_f8f6f4 - f8 x f6 case
+def VOPProfileMAI_F32_V8I32_V6I32_X512 : VOPProfileMAI<VOP_V16F32_V8I32_V6I32_V16F32, AISrc_512_f32, ADst_512, AVSrc_256, AVSrc_192>;
+def VOPProfileMAI_F32_V8I32_V6I32_X512_VCD : VOPProfileMAI<VOP_V16F32_V8I32_V6I32_V16F32, VISrc_512_f32, VDst_512, AVSrc_256, AVSrc_192>;
+
+// For f32_32x32x64_f8f6f4 - f8 x f4 case
+def VOPProfileMAI_F32_V8I32_V4I32_X512 : VOPProfileMAI<VOP_V16F32_V8I32_V4I32_V16F32, AISrc_512_f32, ADst_512, AVSrc_256, AVSrc_128>;
+def VOPProfileMAI_F32_V8I32_V4I32_X512_VCD : VOPProfileMAI<VOP_V16F32_V8I32_V4I32_V16F32, VISrc_512_f32, VDst_512, AVSrc_256, AVSrc_128>;
+
+// For f32_32x32x64_f8f6f4 - f4 x f8 case
+def VOPProfileMAI_F32_V4I32_V8I32_X512 : VOPProfileMAI<VOP_V16F32_V4I32_V8I32_V16F32, AISrc_512_f32, ADst_512, AVSrc_128, AVSrc_256>;
+def VOPProfileMAI_F32_V4I32_V8I32_X512_VCD : VOPProfileMAI<VOP_V16F32_V4I32_V8I32_V16F32, VISrc_512_f32, VDst_512, AVSrc_128, AVSrc_256>;
+
+// For f32_32x32x64_f8f6f4 - f6 x f8 case
+def VOPProfileMAI_F32_V6I32_V8I32_X512 : VOPProfileMAI<VOP_V16F32_V6I32_V8I32_V16F32, AISrc_512_f32, ADst_512, AVSrc_192, AVSrc_256>;
+def VOPProfileMAI_F32_V6I32_V8I32_X512_VCD : VOPProfileMAI<VOP_V16F32_V6I32_V8I32_V16F32, VISrc_512_f32, VDst_512, AVSrc_192, AVSrc_256>;
+
+// For f32_32x32x64_f8f6f4 - f6 x f6 case
+def VOPProfileMAI_F32_V6I32_V6I32_X512 : VOPProfileMAI<VOP_V16F32_V6I32_V6I32_V16F32, AISrc_512_f32, ADst_512, AVSrc_192, AVSrc_192>;
+def VOPProfileMAI_F32_V6I32_V6I32_X512_VCD : VOPProfileMAI<VOP_V16F32_V6I32_V6I32_V16F32, VISrc_512_f32, VDst_512, AVSrc_192, AVSrc_192>;
+
+// For f32_32x32x64_f8f6f4 - f6 x f4 case
+def VOPProfileMAI_F32_V6I32_V4I32_X512 : VOPProfileMAI<VOP_V16F32_V6I32_V4I32_V16F32, AISrc_512_f32, ADst_512, AVSrc_192, AVSrc_128>;
+def VOPProfileMAI_F32_V6I32_V4I32_X512_VCD : VOPProfileMAI<VOP_V16F32_V6I32_V4I32_V16F32, VISrc_512_f32, VDst_512, AVSrc_192, AVSrc_128>;
+
+// For f32_32x32x64_f8f6f4 - f4 x f6 case
+def VOPProfileMAI_F32_V4I32_V6I32_X512 : VOPProfileMAI<VOP_V16F32_V4I32_V6I32_V16F32, AISrc_512_f32, ADst_512, AVSrc_128, AVSrc_192>;
+def VOPProfileMAI_F32_V4I32_V6I32_X512_VCD : VOPProfileMAI<VOP_V16F32_V4I32_V6I32_V16F32, VISrc_512_f32, VDst_512, AVSrc_128, AVSrc_192>;
+
+// For f32_32x32x64_f8f6f4 - f4 x f4 case
+def VOPProfileMAI_F32_V4I32_V4I32_X512 : VOPProfileMAI<VOP_V16F32_V4I32_V4I32_V16F32, AISrc_512_f32, ADst_512, AVSrc_128, AVSrc_128>;
+def VOPProfileMAI_F32_V4I32_V4I32_X512_VCD : VOPProfileMAI<VOP_V16F32_V4I32_V4I32_V16F32, VISrc_512_f32, VDst_512, AVSrc_128, AVSrc_128>;
+}
+
+
class MFMATable <bit is_mac, string Name> { // Tags an MFMA pseudo with a mac flag and a shared name.
bit IsMac = is_mac; // 1 on the _mac variants, 0 on the others.
string FMAOp = Name; // The mac and non-mac forms of one instruction pass the same string here.
}
-class MAIFrag<SDPatternOperator Op, code pred> : PatFrag <
- (ops node:$src0, node:$src1, node:$src2, node:$cbsz, node:$abid, node:$blgp),
- (Op $src0, $src1, $src2, $cbsz, $abid, $blgp),
+class MFMA_F8F6F4_WithSizeTable<int A, int B, Instruction ThisVariant, Instruction F8F8Variant> { // Row class filtered by the getMFMA_F8F6F4_WithSize table.
+ Instruction F8F8Opcode = F8F8Variant; // Key: the f8 x f8 form of the instruction.
+ Instruction Opcode = ThisVariant; // Value: the form with A and B source registers.
+ bits<8> NumRegsSrcA = A;
+ bits<8> NumRegsSrcB = B;
+}
+// Builds a table row from a pseudo: each register count is the source value type's Size shifted right by 5.
+class MFMA_F8F6F4_WithSizeTable_Helper<VOP3_Pseudo ps, string F8F8Op> :
+ MFMA_F8F6F4_WithSizeTable<!srl(ps.Pfl.Src0VT.Size, 5),
+ !srl(ps.Pfl.Src1VT.Size, 5),
+ !cast<Instruction>(NAME), // The record being defined.
+ !cast<Instruction>(F8F8Op)> { // F8F8Op names the f8 x f8 record.
+}
+// PatFrag over Op guarded by pred. With Scaled set, the operands are the three sources, cbsz, blgp and
+// the two op_sel/scale pairs. Currently assumes scaled instructions never have abid
+class MAIFrag<SDPatternOperator Op, code pred, bit HasAbid = true, bit Scaled = false> : PatFrag <
+ !if(Scaled, (ops node:$src0, node:$src1, node:$src2, node:$cbsz, node:$blgp,
+ node:$scale_src0_opsel, node:$scale_src0,
+ node:$scale_src1_opsel, node:$scale_src1),
+ !con((ops node:$src0, node:$src1, node:$src2, node:$cbsz),
+ !if(HasAbid, (ops node:$abid), (ops)), // abid sits between cbsz and blgp when present.
+ (ops node:$blgp))),
+ !if(Scaled, (Op $src0, $src1, $src2, $cbsz, $blgp, $scale_src0_opsel, $scale_src0, $scale_src1_opsel, $scale_src1),
+ !if(HasAbid, (Op $src0, $src1, $src2, $cbsz, $abid, $blgp),
+ (Op $src0, $src1, $src2, $cbsz, $blgp))),
pred
>;
@@ -666,11 +770,15 @@ defvar MayNotNeedAGPRs_gisel = [{
return !MF.getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs();
}];
-class AgprMAIFrag<SDPatternOperator Op> : MAIFrag<Op, MayNeedAGPRs> {
+class AgprMAIFrag<SDPatternOperator Op, bit HasAbid = true,
+ bit Scaled = false> : // Both flags are forwarded to MAIFrag unchanged.
+ MAIFrag<Op, MayNeedAGPRs, HasAbid, Scaled> { // Uses the MayNeedAGPRs predicate.
let GISelPredicateCode = MayNeedAGPRs_gisel;
}
-class VgprMAIFrag<SDPatternOperator Op> : MAIFrag<Op, MayNotNeedAGPRs> {
+class VgprMAIFrag<SDPatternOperator Op, bit HasAbid = true,
+ bit Scaled = false> : // Both flags are forwarded to MAIFrag unchanged.
+ MAIFrag<Op, MayNotNeedAGPRs, HasAbid, Scaled> { // Uses the MayNotNeedAGPRs predicate.
let GISelPredicateCode = MayNotNeedAGPRs_gisel;
}
@@ -681,27 +789,51 @@ let isAsCheapAsAMove = 1, isReMaterializable = 1 in {
} // End isMoveImm = 1
} // End isAsCheapAsAMove = 1, isReMaterializable = 1
-class MAIInst<string OpName, VOPProfile P, SDPatternOperator node>
- : VOP3InstBase<OpName, P, node> {
+class MAIInst<string OpName, VOPProfile P, SDPatternOperator node, bit Scaled = false>
+ : VOP3InstBase<OpName, P, node, /*IsVOP2=*/0, Scaled> { // Scaled becomes VOP3InstBase's MAIScaled argument.
let SubtargetPredicate = HasMAIInsts;
Instruction Opcode = !cast<Instruction>(NAME);
bit is_dgemm = 0;
bit is_gfx940_xdl = 0;
+ let PseudoInstr = NAME; // FIXME: Why is this not the default
}
-multiclass MAIInst<string OpName, string P, SDPatternOperator node> {
+// MFMA pseudo that appends the scale operands to the unscaled BaseInst's operand list.
+// FIXME: Intrinsic should probably not have op_sel operands, we can
+// pattern match byte select patterns into op_sel.
+// FIXME: Missing neg and clamp modifiers
+// FIXME: Usual syntax for op_sel is quite hostile here.
+class ScaledMAIInst<string OpName, MAIInst BaseInst, SDPatternOperator node> :
+ MAIInst<OpName, BaseInst.Pfl, node, /*Scaled=*/true> {
+ // Append operands from V_MFMA_LD_SCALE_B32, but we need to rename them.
+ let InOperandList = !con(BaseInst.InOperandList,
+ (ins VSrc_b32:$scale_src0,
+ VSrc_b32:$scale_src1,
+ op_sel0:$scale_src0_opsel,
+ op_sel_hi0:$scale_src1_opsel)); // Note the op_sel_hi0 operand type for the scale_src1 selector.
+ let AsmOperands =
+ "$vdst, $src0, $src1, $src2, $scale_src0, $scale_src1"
+ "$scale_src0_opsel$scale_src1_opsel$cbsz$blgp";
+// Size is fixed at 16; the real encoding class VOP3PXe derives from Enc128.
+ let FixedSize = 1;
+ let Size = 16;
+}
+
+multiclass MAIInst<string OpName, string P, SDPatternOperator node = null_frag,
+ bit HasAbid = true,
+ bit Scaled = false> {
defvar NoDstOverlap = !cast<VOPProfileMAI>("VOPProfileMAI_" # P).NoDstOverlap;
let isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 in {
// FP32 denorm mode is respected, rounding mode is not. Exceptions are not supported.
let Constraints = !if(NoDstOverlap, "@earlyclobber $vdst", "") in {
def _e64 : MAIInst<OpName, !cast<VOPProfileMAI>("VOPProfileMAI_" # P),
- !if(!or(NoDstOverlap, !eq(node, null_frag)), null_frag, AgprMAIFrag<node>)>,
+ !if(!or(NoDstOverlap, !eq(node, null_frag)), null_frag, AgprMAIFrag<node, HasAbid, Scaled>), Scaled>,
MFMATable<0, NAME # "_e64">;
let OtherPredicates = [isGFX90APlus], Mnemonic = OpName in
def _vgprcd_e64 : MAIInst<OpName # "_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD"),
- !if(!or(NoDstOverlap, !eq(node, null_frag)), null_frag, VgprMAIFrag<node>)>,
+ !if(!or(NoDstOverlap, !eq(node, null_frag)), null_frag, VgprMAIFrag<node, HasAbid, Scaled>), Scaled>,
MFMATable<0, NAME # "_vgprcd_e64">;
}
@@ -710,18 +842,77 @@ multiclass MAIInst<string OpName, string P, SDPatternOperator node> {
isConvertibleToThreeAddress = NoDstOverlap,
Mnemonic = OpName in {
def "_mac_e64" : MAIInst<OpName # "_mac", !cast<VOPProfileMAI>("VOPProfileMAI_" # P),
- !if(!eq(node, null_frag), null_frag, AgprMAIFrag<node>)>,
+ !if(!eq(node, null_frag), null_frag, AgprMAIFrag<node, HasAbid, Scaled>), Scaled>,
MFMATable<1, NAME # "_e64">;
let OtherPredicates = [isGFX90APlus] in
def _mac_vgprcd_e64 : MAIInst<OpName # "_mac_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD"),
- !if(!eq(node, null_frag), null_frag, VgprMAIFrag<node>)>,
+ !if(!eq(node, null_frag), null_frag, VgprMAIFrag<node, HasAbid, Scaled>), Scaled>,
MFMATable<1, NAME # "_vgprcd_e64">;
}
}
} // End isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1
}
+// Defines the scaled counterparts of an unscaled MAIInst variant (_e64, _vgprcd_e64 and, for NoDstOverlap profiles, the _mac forms), appending the operands from V_MFMA_LD_SCALE_B32
+multiclass ScaledMAIInst_mc<string OpName, string UnscaledOpName_, SDPatternOperator node> {
+ defvar VariantSuffix = !subst(!toupper(OpName), "", NAME); // Drop the main opcode name prefix to get the "_fN_fM" suffix.
+ defvar UnscaledOpName = UnscaledOpName_#VariantSuffix; // Record-name prefix of the matching unscaled variant.
+
+ defvar HasAbid = false;
+
+ defvar NoDstOverlap = !cast<VOPProfileMAI>(!cast<MAIInst>(UnscaledOpName#"_e64").Pfl).NoDstOverlap;
+
+ def _e64 : ScaledMAIInst<OpName,
+ !cast<MAIInst>(UnscaledOpName#"_e64"), !if(NoDstOverlap, null_frag, AgprMAIFrag<node, HasAbid, true>)>,
+ MFMATable<0, NAME # "_e64">;
+
+ def _vgprcd_e64 : ScaledMAIInst<OpName # "_vgprcd",
+ !cast<MAIInst>(UnscaledOpName#"_vgprcd_e64"), !if(NoDstOverlap, null_frag, VgprMAIFrag<node, HasAbid, true>)>,
+ MFMATable<0, NAME # "_vgprcd_e64">;
+
+ if NoDstOverlap then { // The tied ($vdst = $src2) _mac forms carry the selection patterns in this case.
+ let Constraints = !if(NoDstOverlap, "$vdst = $src2", ""),
+ isConvertibleToThreeAddress = NoDstOverlap,
+ Mnemonic = UnscaledOpName_ in {
+ def _mac_e64 : ScaledMAIInst<OpName # "_mac",
+ !cast<MAIInst>(UnscaledOpName # "_mac_e64"), AgprMAIFrag<node, HasAbid, true>>,
+ MFMATable<1, NAME # "_e64">;
+
+ def _mac_vgprcd_e64 : ScaledMAIInst<OpName # "_mac_vgprcd",
+ !cast<MAIInst>(UnscaledOpName # "_mac_vgprcd_e64"), VgprMAIFrag<node, HasAbid, true>>,
+ MFMATable<1, NAME # "_vgprcd_e64">;
+ }
+ }
+}
+
+// Each of SrcA and SrcB can be encoded using 3 different sizes, so
+// define 9 permutations of register classes.
+multiclass MAIInst_SrcFormats_mc<string OpName, string ProfileSuffix, SDPatternOperator node = null_frag> { // One MAIInst per (SrcA, SrcB) format pair; ProfileSuffix completes the VOPProfileMAI_ name.
+ defvar HasAbid = false; // Passed as MAIInst's HasAbid argument for every variant.
+ defm _f8_f8 : MAIInst<OpName, "F32_V8I32_V8I32"#ProfileSuffix, node, HasAbid>;
+ defm _f8_f6 : MAIInst<OpName, "F32_V8I32_V6I32"#ProfileSuffix, node, HasAbid>;
+ defm _f6_f8 : MAIInst<OpName, "F32_V6I32_V8I32"#ProfileSuffix, node, HasAbid>;
+ defm _f6_f6 : MAIInst<OpName, "F32_V6I32_V6I32"#ProfileSuffix, node, HasAbid>;
+ defm _f8_f4 : MAIInst<OpName, "F32_V8I32_V4I32"#ProfileSuffix, node, HasAbid>;
+ defm _f4_f8 : MAIInst<OpName, "F32_V4I32_V8I32"#ProfileSuffix, node, HasAbid>;
+ defm _f6_f4 : MAIInst<OpName, "F32_V6I32_V4I32"#ProfileSuffix, node, HasAbid>;
+ defm _f4_f6 : MAIInst<OpName, "F32_V4I32_V6I32"#ProfileSuffix, node, HasAbid>;
+ defm _f4_f4 : MAIInst<OpName, "F32_V4I32_V4I32"#ProfileSuffix, node, HasAbid>;
+}
+// Scaled counterpart of MAIInst_SrcFormats_mc: the same nine suffixes, each wrapping the unscaled variant of that suffix.
+multiclass MAIInst_SrcFormats_Scaled_mc<string OpName, string UnscaledOpName, SDPatternOperator node> {
+ defm _f8_f8 : ScaledMAIInst_mc<OpName, UnscaledOpName, node>;
+ defm _f8_f6 : ScaledMAIInst_mc<OpName, UnscaledOpName, node>;
+ defm _f6_f8 : ScaledMAIInst_mc<OpName, UnscaledOpName, node>;
+ defm _f6_f6 : ScaledMAIInst_mc<OpName, UnscaledOpName, node>;
+ defm _f8_f4 : ScaledMAIInst_mc<OpName, UnscaledOpName, node>;
+ defm _f4_f8 : ScaledMAIInst_mc<OpName, UnscaledOpName, node>;
+ defm _f6_f4 : ScaledMAIInst_mc<OpName, UnscaledOpName, node>;
+ defm _f4_f6 : ScaledMAIInst_mc<OpName, UnscaledOpName, node>;
+ defm _f4_f4 : ScaledMAIInst_mc<OpName, UnscaledOpName, node>;
+}
+
defm V_MFMA_F32_4X4X1F32 : MAIInst<"v_mfma_f32_4x4x1f32", "F32_F32_X4", int_amdgcn_mfma_f32_4x4x1f32>;
defm V_MFMA_F32_16X16X1F32 : MAIInst<"v_mfma_f32_16x16x1f32", "F32_F32_X16", int_amdgcn_mfma_f32_16x16x1f32>;
defm V_MFMA_F32_16X16X4F32 : MAIInst<"v_mfma_f32_16x16x4f32", "F32_F32_X4", int_amdgcn_mfma_f32_16x16x4f32>;
@@ -753,6 +944,20 @@ let SubtargetPredicate = HasGFX950Insts, is_gfx940_xdl = 1 in {
defm V_MFMA_F32_16X16X32_F16 : MAIInst<"v_mfma_f32_16x16x32f16", "F32_V8F16_X32", int_amdgcn_mfma_f32_16x16x32_f16>;
defm V_MFMA_F32_32X32X16_F16 : MAIInst<"v_mfma_f32_32x32x16f16", "F32_V8F16_X16", int_amdgcn_mfma_f32_32x32x16_f16>;
defm V_MFMA_F32_32X32X16_BF16 : MAIInst<"v_mfma_f32_32x32x16bf16", "F32_V8BF16_X16", int_amdgcn_mfma_f32_32x32x16_bf16>;
+// f8f6f4 pseudos. The unscaled forms pass no node, so MAIInst_SrcFormats_mc's null_frag default applies.
+defm V_MFMA_F32_16X16X128_F8F6F4 : MAIInst_SrcFormats_mc<"v_mfma_f32_16x16x128f8f6f4",
+ "_X128">;
+defm V_MFMA_F32_32X32X64_F8F6F4 : MAIInst_SrcFormats_mc<"v_mfma_f32_32x32x64f8f6f4",
+ "_X512">;
+
+defm V_MFMA_SCALE_F32_16X16X128_F8F6F4 : MAIInst_SrcFormats_Scaled_mc<
+ "v_mfma_scale_f32_16x16x128_f8f6f4", "V_MFMA_F32_16X16X128_F8F6F4", // Second string is the unscaled record-name prefix.
+ int_amdgcn_mfma_scale_f32_16x16x128_f8f6f4>;
+
+defm V_MFMA_SCALE_F32_32X32X64_F8F6F4 : MAIInst_SrcFormats_Scaled_mc<
+ "v_mfma_scale_f32_32x32x64_f8f6f4",
+ "V_MFMA_F32_32X32X64_F8F6F4",
+ int_amdgcn_mfma_scale_f32_32x32x64_f8f6f4>;
}
let SubtargetPredicate = HasGFX950Insts in {
@@ -869,7 +1074,6 @@ def VOP_V4F32_V16F16_V16F16_V4F32 : VOPProfile <[v4f32, v16f16, v16f16, v4f32]>;
def VOP_V4F32_V16I16_V16I16_V4F32 : VOPProfile <[v4f32, v16i16, v16i16, v4f32]>;
def VOP_V8F16_V16F16_V16F16_V8F16 : VOPProfile <[v8f16, v16f16, v16f16, v8f16]>;
def VOP_V8I16_V16I16_V16I16_V8I16 : VOPProfile <[v8i16, v16i16, v16i16, v8i16]>;
-def VOP_V4I32_V4I32_V4I32_V4I32 : VOPProfile <[v4i32, v4i32, v4i32, v4i32]>;
def VOP_V4I32_V2I32_V2I32_V4I32 : VOPProfile <[v4i32, v2i32, v2i32, v4i32]>;
@@ -1676,6 +1880,26 @@ multiclass VOP3P_Real_MFMA_gfx940<bits<7> op, string Name = !cast<VOP3_Pseudo>(N
}
}
+multiclass VOP3P_Real_MFMA_F8F6F4_gfx940<bits<7> op, string Name = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic,
+ VOP3_Pseudo PS_ACD = !cast<VOP3_Pseudo>(NAME # "_e64"),
+ VOP3_Pseudo PS_VCD = !cast<VOP3_Pseudo>(NAME # "_vgprcd" # "_e64")> {
+// Table key: NAME with its trailing six-character "_fN_fM" suffix replaced by "_f8_f8".
+ defvar F8F8Name = !substr(NAME, 0, !sub(!size(NAME), !size("_fN_fM")))#"_f8_f8";
+
+ let AssemblerPredicate = isGFX940Plus,
+ DecoderNamespace = "GFX940",
+ AsmString = Name # PS_ACD.AsmOperands,
+ Constraints = "" in {
+ def _gfx940_acd : VOP3P_Real<PS_ACD, SIEncodingFamily.GFX940>,
+ VOP3Pe_MAI <op, PS_ACD.Pfl, 1>, // acc_cd = 1
+ MFMA_F8F6F4_WithSizeTable_Helper<PS_ACD, F8F8Name#"_gfx940_acd">;
+
+ def _gfx940_vcd : VOP3P_Real<PS_VCD, SIEncodingFamily.GFX940>,
+ VOP3Pe_MAI <op, PS_VCD.Pfl, 0>, // acc_cd = 0
+ MFMA_F8F6F4_WithSizeTable_Helper<PS_VCD, F8F8Name#"_gfx940_vcd">;
+ } // End AssemblerPredicate = isGFX940Plus, DecoderNamespace = "GFX940"
+}
+
multiclass VOP3P_Real_MFMA_gfx950<bits<7> op, string Name = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic,
VOP3_Pseudo PS_ACD = !cast<VOP3_Pseudo>(NAME # "_e64"),
VOP3_Pseudo PS_VCD = !cast<VOP3_Pseudo>(NAME # "_vgprcd" # "_e64")> {
@@ -1686,6 +1910,55 @@ multiclass VOP3P_Real_MFMA_gfx950<bits<7> op, string Name = !cast<VOP3_Pseudo>(N
}
+multiclass VOP3P_Real_MFMA_F8F6F4_gfx950_mc<bits<7> op, string Name> { // All nine source-format variants share opcode op and mnemonic Name.
+ defm _f8_f8 : VOP3P_Real_MFMA_F8F6F4_gfx940<op, Name>; // The only variant not marked isAsmParserOnly.
+
+ let isAsmParserOnly = true in { // Disable ambiguous disassembly.
+ defm _f8_f6 : VOP3P_Real_MFMA_F8F6F4_gfx940<op, Name>;
+ defm _f6_f8 : VOP3P_Real_MFMA_F8F6F4_gfx940<op, Name>;
+ defm _f8_f4 : VOP3P_Real_MFMA_F8F6F4_gfx940<op, Name>;
+ defm _f4_f8 : VOP3P_Real_MFMA_F8F6F4_gfx940<op, Name>;
+ defm _f6_f6 : VOP3P_Real_MFMA_F8F6F4_gfx940<op, Name>;
+ defm _f6_f4 : VOP3P_Real_MFMA_F8F6F4_gfx940<op, Name>;
+ defm _f4_f6 : VOP3P_Real_MFMA_F8F6F4_gfx940<op, Name>;
+ defm _f4_f4 : VOP3P_Real_MFMA_F8F6F4_gfx940<op, Name>;
+ }
+}
+// Real encodings of one scaled f8f6f4 source-format variant, using the VOP3PXe encoding class.
+multiclass VOP3PX_Real_ScaledMFMA<bits<7> op> {
+ defvar PS_ACD = !cast<VOP3_Pseudo>(NAME # "_e64");
+ defvar PS_VCD = !cast<VOP3_Pseudo>(NAME # "_vgprcd" # "_e64");
+ defvar Name = PS_ACD.Mnemonic;
+ defvar F8F8Name = !substr(NAME, 0, !sub(!size(NAME), !size("_fN_fM")))#"_f8_f8"; // NAME with its "_fN_fM" suffix replaced by "_f8_f8".
+
+ let SubtargetPredicate = HasGFX950Insts,
+ DecoderNamespace = "GFX940",
+ AsmString = Name # PS_ACD.AsmOperands, Constraints = "" in {
+ def _gfx940_acd : VOP3P_Real<PS_ACD, SIEncodingFamily.GFX940>,
+ VOP3PXe <op, PS_ACD.Pfl, /*acc_cd=*/1>,
+ MFMA_F8F6F4_WithSizeTable_Helper<PS_ACD, F8F8Name#"_gfx940_acd">;
+
+ def _gfx940_vcd : VOP3P_Real<PS_VCD, SIEncodingFamily.GFX940>,
+ VOP3PXe <op, PS_VCD.Pfl, /*acc_cd=*/0>,
+ MFMA_F8F6F4_WithSizeTable_Helper<PS_VCD, F8F8Name#"_gfx940_vcd">;
+ }
+}
+// Instantiates VOP3PX_Real_ScaledMFMA for all nine source-format suffixes with the same opcode.
+multiclass VOP3PX_Real_ScaledMFMA_F8F6F4_mc<bits<7> op> {
+ defm _f8_f8 : VOP3PX_Real_ScaledMFMA<op>; // The only variant not marked isAsmParserOnly.
+
+ let isAsmParserOnly = 1 in { // Disable ambiguous disassembly.
+ defm _f8_f6 : VOP3PX_Real_ScaledMFMA<op>;
+ defm _f6_f8 : VOP3PX_Real_ScaledMFMA<op>;
+ defm _f8_f4 : VOP3PX_Real_ScaledMFMA<op>;
+ defm _f4_f8 : VOP3PX_Real_ScaledMFMA<op>;
+ defm _f6_f6 : VOP3PX_Real_ScaledMFMA<op>;
+ defm _f6_f4 : VOP3PX_Real_ScaledMFMA<op>;
+ defm _f4_f6 : VOP3PX_Real_ScaledMFMA<op>;
+ defm _f4_f4 : VOP3PX_Real_ScaledMFMA<op>;
+ }
+}
+
multiclass VOP3P_Real_MFMA_vi<bits<7> op> {
def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl, ?> {
@@ -1797,6 +2070,10 @@ defm V_MFMA_F32_32X32X16_F16 : VOP3P_Real_MFMA_gfx950 <0x55, "v_mfma_f32_32x
defm V_MFMA_F32_32X32X16_BF16 : VOP3P_Real_MFMA_gfx950 <0x37, "v_mfma_f32_32x32x16_bf16">;
defm V_MFMA_LD_SCALE_B32 : VOP3P_Real_vi <0x2c>;
+defm V_MFMA_F32_16X16X128_F8F6F4 : VOP3P_Real_MFMA_F8F6F4_gfx950_mc <0x2d, "v_mfma_f32_16x16x128_f8f6f4">;
+defm V_MFMA_SCALE_F32_16X16X128_F8F6F4 : VOP3PX_Real_ScaledMFMA_F8F6F4_mc <0x2d>; // Same opcode as the unscaled form above.
+defm V_MFMA_F32_32X32X64_F8F6F4 : VOP3P_Real_MFMA_F8F6F4_gfx950_mc <0x2e, "v_mfma_f32_32x32x64_f8f6f4">;
+defm V_MFMA_SCALE_F32_32X32X64_F8F6F4 : VOP3PX_Real_ScaledMFMA_F8F6F4_mc <0x2e>; // Same opcode as the unscaled form above.
defm V_MFMA_I32_32X32X16I8 : VOP3P_Real_MFMA_gfx940 <0x56, "v_mfma_i32_32x32x16_i8">;
defm V_MFMA_I32_16X16X32I8 : VOP3P_Real_MFMA_gfx940 <0x57, "v_mfma_i32_16x16x32_i8">;
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index a6e6adac04e5a9..aa9758219db914 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -448,7 +448,7 @@ class VOP3Pe <bits<7> op, VOPProfile P> : Enc64 {
let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); // neg (lo)
}
-class VOP3Pe_MAI <bits<7> op, VOPProfile P, bit acc_cd = 0> : Enc64 {
+class VOP3Pe_MAI_Base {
bits<8> vdst;
bits<10> src0;
bits<10> src1;
@@ -456,11 +456,13 @@ class VOP3Pe_MAI <bits<7> op, VOPProfile P, bit acc_cd = 0> : Enc64 {
bits<3> blgp;
bits<3> cbsz;
bits<4> abid;
+}
+class VOP3Pe_MAI <bits<7> op, VOPProfile P, bit acc_cd = 0> : Enc64, VOP3Pe_MAI_Base {
let Inst{7-0} = vdst;
let Inst{10-8} = !if(P.HasSrc1, cbsz, 0);
- let Inst{14-11} = !if(P.HasSrc1, abid, 0);
+ let Inst{14-11} = !if(P.HasAbid, abid, 0);
let Inst{15} = acc_cd;
@@ -506,6 +508,59 @@ class VOP3Pe_SMFMAC <bits<7> op> : Enc64 {
let Inst{63-61} = blgp;
}
+class VOP3PXe <bits<7> op, VOPProfile MFMAPfl, bit acc_cd = 0> : Enc128, VOP3Pe_MAI_Base { // 128-bit encoding: scale fields in the low 64 bits, the MFMA fields in the high 64 bits.
+ bits<9> scale_src0;
+ bits<9> scale_src1;
+
+ bits<2> scale_src0_opsel; // Bit 0 is placed in the op_sel field, bit 1 in the op_sel_hi field.
+ bits<2> scale_src1_opsel;
+
+ // Inst{7-0} = unused
+ // Inst{10-8} = neg_hi;
+ // Inst{13-11} = op_sel
+ let Inst{11} = scale_src0_opsel{0};
+ let Inst{12} = scale_src1_opsel{0};
+ // Inst{13} = unused op_sel
+ // Inst{14} = unused op_sel_hi2
+
+ let Inst{31-16} = 0b1101001110101100; // Bits 31-23 are 0x1a7 (encoding) and bits 22-16 are 0x2c, the V_MFMA_LD_SCALE_B32 opcode.
+ let Inst{40-32} = scale_src0;
+ let Inst{49-41} = scale_src1;
+ // Inst{58-50} = unused
+ // Inst{60-59} = op_sel_hi;
+ let Inst{59} = scale_src0_opsel{1};
+ let Inst{60} = scale_src1_opsel{1};
+ // Inst{63-61} = neg;
+
+ // The high half of the encoding is the unscaled mfma op.
+ //
+ // FIXME: Defining the encoding in terms of the base instruction
+ // seems to not work, results in all 0 encoding, so replicate all
+ // the fields from VOP3Pe_MAI, shifted up by 64
+ //
+ // defvar Hi = VOP3Pe_MAI<op, MFMAPfl, acc_cd>;
+ // let Inst{127-64} = Hi.Inst;
+
+ let Inst{71-64} = vdst;
+ let Inst{74-72} = !if(MFMAPfl.HasSrc1, cbsz, 0);
+
+ // abid must be 1 to use a scale.
+ let Inst{78-75} = 0b0001; // abid
+
+ let Inst{79} = acc_cd;
+
+ let Inst{86-80} = op;
+ let Inst{95-87} = 0x1a7; //encoding
+ let Inst{104-96} = !if(MFMAPfl.HasSrc0, src0{8-0}, 0);
+ let Inst{113-105} = !if(MFMAPfl.HasSrc1, src1{8-0}, 0);
+ let Inst{122-114} = !if(MFMAPfl.HasSrc2, src2, 0);
+
+ let Inst{123} = !if(MFMAPfl.HasSrc0, src0{9}, 0); // acc(0)
+ let Inst{124} = !if(MFMAPfl.HasSrc1, src1{9}, 0); // acc(1)
+
+ let Inst{127-125} = !if(MFMAPfl.HasSrc1, blgp, 0);
+}
+
class VOP3Pe_gfx10 <bits<7> op, VOPProfile P> : VOP3Pe<op, P> {
let Inst{31-23} = 0x198; //encoding
}
@@ -1343,15 +1398,39 @@ class getVOP3ClampPat<VOPProfile P, SDPatternOperator node> {
}
class getVOP3MAIPat<VOPProfile P, SDPatternOperator node> {
- list<dag> ret = !if(!eq(P.Src0VT, P.Src1VT),
+ list<dag> mfma_with_abid = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2,
+ timm:$cbsz, timm:$abid, timm:$blgp))];
+ list<dag> mfma_no_abid = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2,
+ timm:$cbsz, timm:$blgp))];
+// Select on P.IsSMFMAC: the f8f6f4 profiles can have differing source types, so type equality cannot identify mfma.
+ list<dag> ret = !if(!not(P.IsSMFMAC),
// mfma
- [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2,
- timm:$cbsz, timm:$abid, timm:$blgp))],
+ !if(P.HasAbid, mfma_with_abid, mfma_no_abid),
+
// smfmac
[(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2, i32:$idx,
timm:$cbsz, timm:$abid))]);
}
+class getVOP3MAIScaledPat<VOPProfile P, SDPatternOperator node> { // Scaled variant of getVOP3MAIPat: appends the op_sel/scale operand pairs.
+ list<dag> ret = !if(!not(P.IsSMFMAC),
+ // mfma
+ [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2,
+ timm:$cbsz, timm:$blgp, // No abid operand in the scaled mfma form.
+ MFMALdScaleModifierOp:$scale_src0_opsel,
+ i32:$scale_src0,
+ MFMALdScaleModifierOp:$scale_src1_opsel,
+ i32:$scale_src1
+ ))],
+ // smfmac
+ [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2, i32:$idx,
+ timm:$cbsz, timm:$abid,
+ MFMALdScaleModifierOp:$scale_src0_opsel,
+ i32:$scale_src0,
+ MFMALdScaleModifierOp:$scale_src1_opsel,
+ i32:$scale_src1))]);
+}
+
class VOP3Features<bit Clamp, bit OpSel, bit Packed, bit MAI> {
bit HasClamp = Clamp;
bit HasOpSel = OpSel;
@@ -1415,7 +1494,7 @@ multiclass VOP3Inst_Pseudo_Wrapper<string opName, VOPProfile P, list<dag> patter
def _e64 : VOP3_Pseudo<opName, P, pattern, VOP3Only>;
}
-class VOP3InstBase<string OpName, VOPProfile P, SDPatternOperator node = null_frag, bit IsVOP2 = 0> :
+class VOP3InstBase<string OpName, VOPProfile P, SDPatternOperator node = null_frag, bit IsVOP2 = 0, bit MAIScaled = false> :
VOP3_Pseudo<OpName, P,
!if(P.HasOpSel,
!if(P.HasModifiers,
@@ -1428,7 +1507,7 @@ class VOP3InstBase<string OpName, VOPProfile P, SDPatternOperator node = null_fr
!if(P.HasIntClamp,
getVOP3ClampPat<P, node>.ret,
!if (P.IsMAI,
- getVOP3MAIPat<P, node>.ret,
+ !if(MAIScaled, getVOP3MAIScaledPat<P, node>.ret, getVOP3MAIPat<P, node>.ret),
getVOP3Pat<P, node>.ret))))),
0, P.HasOpSel> {
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
index 00a3aaf77f9003..9769d8b1d910d2 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
@@ -285,6 +285,26 @@ define amdgpu_kernel void @mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x bfloa
ret void
}
+declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32>, <8 x i32>, <4 x float>, i32 immarg, i32 immarg, ; src0, src1, src2, cbsz, blgp
+ i32 immarg, i32, i32 immarg, i32) ; scale_src0_opsel, scale_src0, scale_src1_opsel, scale_src1
+
+; CHECK: DIVERGENT: %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0, i32 %arg3, i32 immarg 0, i32 %arg4)
+define amdgpu_kernel void @mfma_scale_f32_16x16x128_f8f6f4(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 %arg4, ptr addrspace(1) %out) { ; the intrinsic result is expected to be reported as divergent
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0, i32 %arg3, i32 immarg 0, i32 %arg4)
+ store <4 x float> %result, ptr addrspace(1) %out ; write the result to %out
+ ret void
+}
+
+declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32>, <8 x i32>, <16 x float>, i32 immarg, i32 immarg, ; src0, src1, src2, cbsz, blgp
+ i32 immarg, i32, i32 immarg, i32) ; scale_src0_opsel, scale_src0, scale_src1_opsel, scale_src1
+
+; CHECK: DIVERGENT: %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0, i32 %arg3, i32 immarg 0, i32 %arg4)
+define amdgpu_kernel void @mfma_f32_scale_32x32x64_f8f6f4(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 %arg4, ptr addrspace(1) %out) { ; the intrinsic result is expected to be reported as divergent
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0, i32 %arg3, i32 immarg 0, i32 %arg4)
+ store <16 x float> %result, ptr addrspace(1) %out ; write the result to %out
+ ret void
+}
+
declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1
declare i32 @llvm.amdgcn.permlane16.i32(i32, i32, i32, i32, i1, i1) #1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
new file mode 100644
index 00000000000000..9658f8381bff21
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
@@ -0,0 +1,2296 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -global-isel=0 < %s | FileCheck -check-prefixes=GCN,SDAG %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -global-isel=1 < %s | FileCheck -check-prefixes=GCN,GISEL %s
+
+; 0 = fp8
+; 1 = bf8
+; 2 = fp6
+; 3 = bf6
+; 4 = fp4
+
+; --------------------------------------------------------------------
+; Different format signatures
+; --------------------------------------------------------------------
+
+; fp8 x fp8
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+ i32 0, ; cbsz: %arg0 format (0 = fp8)
+ i32 0, ; blgp: %arg1 format (0 = fp8)
+ i32 0, i32 %scale0, i32 0, i32 %scale1) ; scale_src0_opsel, scale_src0, scale_src1_opsel, scale_src1
+ ret <4 x float> %result
+}
+
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+ i32 0, ; cbsz: %arg0 format (0 = fp8); note the function name says cbsz1
+ i32 0, ; blgp: %arg1 format (0 = fp8); note the function name says blgp1
+ i32 1, i32 %scale0, i32 1, i32 %scale1) ; scale_src0_opsel = 1, scale_src0, scale_src1_opsel = 1, scale_src1; the opsel values are not visible in the checked instruction (only op_sel_hi:[0,0,0] is printed)
+ ret <4 x float> %result
+}
+
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+ i32 0, ; cbsz: %arg0 format (0 = fp8); note the function name says cbsz1
+ i32 0, ; blgp: %arg1 format (0 = fp8); note the function name says blgp1
+ i32 2, i32 %scale0, i32 2, i32 %scale1) ; scale_src0_opsel = 2, scale_src0, scale_src1_opsel = 2, scale_src1; the opsel values are not visible in the checked instruction (only op_sel_hi:[0,0,0] is printed)
+ ret <4 x float> %result
+}
+
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+ i32 0, ; cbsz: %arg0 format (0 = fp8); note the function name says cbsz1
+ i32 0, ; blgp: %arg1 format (0 = fp8); note the function name says blgp1
+ i32 3, i32 %scale0, i32 3, i32 %scale1) ; scale_src0_opsel = 3, scale_src0, scale_src1_opsel = 3, scale_src1; the opsel values are not visible in the checked instruction (only op_sel_hi:[0,0,0] is printed)
+ ret <4 x float> %result
+}
+
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+ i32 0, ; cbsz: %arg0 format (0 = fp8); note the function name says cbsz1
+ i32 0, ; blgp: %arg1 format (0 = fp8); note the function name says blgp1
+ i32 0, i32 %scale0, i32 3, i32 %scale1) ; scale_src0_opsel = 0, scale_src0, scale_src1_opsel = 3, scale_src1; the opsel values are not visible in the checked instruction (only op_sel_hi:[0,0,0] is printed)
+ ret <4 x float> %result
+}
+
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+ i32 0, ; cbsz: %arg0 format (0 = fp8); note the function name says cbsz1
+ i32 0, ; blgp: %arg1 format (0 = fp8); note the function name says blgp1
+ i32 3, i32 %scale0, i32 0, i32 %scale1) ; scale_src0_opsel = 3, scale_src0, scale_src1_opsel = 0, scale_src1; the opsel values are not visible in the checked instruction (only op_sel_hi:[0,0,0] is printed)
+ ret <4 x float> %result
+}
+
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+ i32 0, ; cbsz: %arg0 format (0 = fp8); note the function name says cbsz1
+ i32 0, ; blgp: %arg1 format (0 = fp8); note the function name says blgp1
+ i32 2, i32 %scale0, i32 3, i32 %scale1) ; scale_src0_opsel = 2, scale_src0, scale_src1_opsel = 3, scale_src1; the opsel values are not visible in the checked instruction (only op_sel_hi:[0,0,0] is printed)
+ ret <4 x float> %result
+}
+
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+ i32 0, ; cbsz: %arg0 format (0 = fp8); note the function name says cbsz1
+ i32 0, ; blgp: %arg1 format (0 = fp8); note the function name says blgp1
+ i32 3, i32 %scale0, i32 2, i32 %scale1) ; scale_src0_opsel = 3, scale_src0, scale_src1_opsel = 2, scale_src1; the opsel values are not visible in the checked instruction (only op_sel_hi:[0,0,0] is printed)
+ ret <4 x float> %result
+}
+
+; This should be optimized to avoid the scale; for now the checks still expect v_mfma_scale_* with 0, 0 scale operands.
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 0, 0 op_sel_hi:[0,0,0]
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+ i32 0, ; cbsz: %arg0 format (0 = fp8)
+ i32 0, ; blgp: %arg1 format (0 = fp8)
+ i32 0, i32 0, i32 0, i32 0) ; scale_src0_opsel, scale_src0, scale_src1_opsel, scale_src1 (all constant 0)
+ ret <4 x float> %result
+}
+
+; fp8 x bf8
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:1
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+ i32 0, ; cbsz: %arg0 format (0 = fp8)
+ i32 1, ; blgp: %arg1 format (1 = bf8)
+ i32 0, i32 %scale0, i32 0, i32 %scale1) ; scale_src0_opsel, scale_src0, scale_src1_opsel, scale_src1
+ ret <4 x float> %result
+}
+
+; This should be optimized to avoid the scale; for now the checks still expect v_mfma_scale_* with 0, 0 scale operands.
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 0, 0 op_sel_hi:[0,0,0] blgp:1
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+ i32 0, ; cbsz: %arg0 format (0 = fp8)
+ i32 1, ; blgp: %arg1 format (1 = bf8)
+ i32 0, i32 0, i32 0, i32 0) ; scale_src0_opsel, scale_src0, scale_src1_opsel, scale_src1 (all constant 0)
+ ret <4 x float> %result
+}
+
+; fp8 x fp6
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:2
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
+ i32 0, ; cbsz: %arg0 format (0 = fp8)
+ i32 2, ; blgp: %arg1 format (2 = fp6)
+ i32 0, i32 %scale0, i32 0, i32 %scale1) ; scale_src0_opsel, scale_src0, scale_src1_opsel, scale_src1
+ ret <4 x float> %result
+}
+
+; This should be optimized to avoid the scale; for now the checks still expect v_mfma_scale_* with 0, 0 scale operands.
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], 0, 0 op_sel_hi:[0,0,0] blgp:2
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
+ i32 0, ; cbsz: %arg0 format (0 = fp8)
+ i32 2, ; blgp: %arg1 format (2 = fp6)
+ i32 0, i32 0, i32 0, i32 0) ; scale_src0_opsel, scale_src0, scale_src1_opsel, scale_src1 (all constant 0)
+ ret <4 x float> %result
+}
+
+; fp8 x bf6
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:3
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
+ i32 0, ; cbsz: %arg0 format (0 = fp8)
+ i32 3, ; blgp: %arg1 format (3 = bf6)
+ i32 0, i32 %scale0, i32 0, i32 %scale1) ; scale_src0_opsel, scale_src0, scale_src1_opsel, scale_src1
+ ret <4 x float> %result
+}
+
+; This should be optimized to avoid the scale; for now the checks still expect v_mfma_scale_* with 0, 0 scale operands.
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], 0, 0 op_sel_hi:[0,0,0] blgp:3
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
+ i32 0, ; cbsz: %arg0 format (0 = fp8)
+ i32 3, ; blgp: %arg1 format (3 = bf6)
+ i32 0, i32 0, i32 0, i32 0) ; scale_src0_opsel, scale_src0, scale_src1_opsel, scale_src1 (all constant 0)
+ ret <4 x float> %result
+}
+
+; fp8 x fp4
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] blgp:4
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
+ i32 0, ; cbsz: %arg0 format (0 = fp8)
+ i32 4, ; blgp: %arg1 format (4 = fp4)
+ i32 0, i32 %scale0, i32 0, i32 %scale1) ; scale_src0_opsel, scale_src0, scale_src1_opsel, scale_src1
+ ret <4 x float> %result
+}
+
+; This should be optimized to avoid the scale; for now the checks still expect v_mfma_scale_* with 0, 0 scale operands.
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], 0, 0 op_sel_hi:[0,0,0] blgp:4
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
+ i32 0, ; cbsz: %arg0 format (0 = fp8)
+ i32 4, ; blgp: %arg1 format (4 = fp4)
+ i32 0, i32 0, i32 0, i32 0) ; scale_src0_opsel, scale_src0, scale_src1_opsel, scale_src1 (all constant 0)
+ ret <4 x float> %result
+}
+
+; bf8 x fp8
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+ i32 1, ; cbsz: %arg0 format (1 = bf8)
+ i32 0, ; blgp: %arg1 format (0 = fp8)
+ i32 0, i32 %scale0, i32 0, i32 %scale1) ; scale_src0_opsel, scale_src0, scale_src1_opsel, scale_src1
+ ret <4 x float> %result
+}
+
+; This should be optimized to avoid the scale; for now the checks still expect v_mfma_scale_* with 0, 0 scale operands.
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 0, 0 op_sel_hi:[0,0,0] cbsz:1
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+ i32 1, ; cbsz: %arg0 format (1 = bf8)
+ i32 0, ; blgp: %arg1 format (0 = fp8)
+ i32 0, i32 0, i32 0, i32 0) ; scale_src0_opsel, scale_src0, scale_src1_opsel, scale_src1 (all constant 0)
+ ret <4 x float> %result
+}
+
+; bf8 x bf8
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 blgp:1
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+ i32 1, ; cbsz: %arg0 format (1 = bf8)
+ i32 1, ; blgp: %arg1 format (1 = bf8)
+ i32 0, i32 %scale0, i32 0, i32 %scale1) ; scale_src0_opsel, scale_src0, scale_src1_opsel, scale_src1
+ ret <4 x float> %result
+}
+
+
+; This should be optimized to avoid the scale; for now the checks still expect v_mfma_scale_* with 0, 0 scale operands.
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 0, 0 op_sel_hi:[0,0,0] cbsz:1 blgp:1
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+ i32 1, ; cbsz: %arg0 format (1 = bf8)
+ i32 1, ; blgp: %arg1 format (1 = bf8)
+ i32 0, i32 0, i32 0, i32 0) ; scale_src0_opsel, scale_src0, scale_src1_opsel, scale_src1 (all constant 0)
+ ret <4 x float> %result
+}
+
+; bf8 x fp6
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:2
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
+ i32 1, ; cbsz: %arg0 format (1 = bf8)
+ i32 2, ; blgp: %arg1 format (2 = fp6)
+ i32 0, i32 %scale0, i32 0, i32 %scale1) ; scale_src0_opsel, scale_src0, scale_src1_opsel, scale_src1
+ ret <4 x float> %result
+}
+
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], 0, 0 op_sel_hi:[0,0,0] cbsz:1 blgp:2
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
+ i32 1, ; cbsz: %arg0 format (1 = bf8)
+ i32 2, ; blgp: %arg1 format (2 = fp6)
+ i32 0, i32 0, i32 0, i32 0) ; scale_src0_opsel, scale_src0, scale_src1_opsel, scale_src1 (all constant 0); the checks still expect v_mfma_scale_* with 0, 0 scale operands
+ ret <4 x float> %result
+}
+
+; bf8 x bf6
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:3
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
+ i32 1, ; cbsz: %arg0 format (1 = bf8)
+ i32 3, ; blgp: %arg1 format (3 = bf6)
+ i32 0, i32 %scale0, i32 0, i32 %scale1) ; scale_src0_opsel, scale_src0, scale_src1_opsel, scale_src1
+ ret <4 x float> %result
+}
+
+; This should be optimized to avoid the scale; for now the checks still expect v_mfma_scale_* with 0, 0 scale operands.
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], 0, 0 op_sel_hi:[0,0,0] cbsz:1 blgp:3
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
+ i32 1, ; cbsz: %arg0 format (1 = bf8)
+ i32 3, ; blgp: %arg1 format (3 = bf6)
+ i32 0, i32 0, i32 0, i32 0) ; scale_src0_opsel, scale_src0, scale_src1_opsel, scale_src1 (all constant 0)
+ ret <4 x float> %result
+}
+
+; bf8 x fp4
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:1 blgp:4
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
+ i32 1, ; cbsz: %arg0 format (1 = bf8)
+ i32 4, ; blgp: %arg1 format (4 = fp4)
+ i32 0, i32 %scale0, i32 0, i32 %scale1) ; scale_src0_opsel, scale_src0, scale_src1_opsel, scale_src1
+ ret <4 x float> %result
+}
+
+; This should be optimized to avoid the scale; for now the checks still expect v_mfma_scale_* with 0, 0 scale operands.
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], 0, 0 op_sel_hi:[0,0,0] cbsz:1 blgp:4
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
+ i32 1, ; cbsz: %arg0 format (1 = bf8)
+ i32 4, ; blgp: %arg1 format (4 = fp4)
+ i32 0, i32 0, i32 0, i32 0) ; scale_src0_opsel, scale_src0, scale_src1_opsel, scale_src1 (all constant 0)
+ ret <4 x float> %result
+}
+
+; fp6 x fp8
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+ i32 2, ; cbsz: %arg0 format (2 = fp6)
+ i32 0, ; blgp: %arg1 format (0 = fp8)
+ i32 0, i32 %scale0, i32 0, i32 %scale1) ; scale_src0_opsel, scale_src0, scale_src1_opsel, scale_src1
+ ret <4 x float> %result
+}
+
+; This should be optimized to avoid the scale
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], 0, 0 op_sel_hi:[0,0,0] cbsz:2
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+ i32 2, ; cbsz
+ i32 0, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <4 x float> %result
+}
+
+; fp6 x bf8
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2 blgp:1
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+ i32 2, ; cbsz
+ i32 1, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <4 x float> %result
+}
+
+; This should be optimized to avoid the scale
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], 0, 0 op_sel_hi:[0,0,0] cbsz:2 blgp:1
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+ i32 2, ; cbsz
+ i32 1, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <4 x float> %result
+}
+
+; fp6 x fp6
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:2
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
+ i32 2, ; cbsz
+ i32 2, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <4 x float> %result
+}
+
+; This should be optimized to avoid the scale
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], 0, 0 op_sel_hi:[0,0,0] cbsz:2 blgp:2
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
+ i32 2, ; cbsz
+ i32 2, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <4 x float> %result
+}
+
+; fp6 x bf6
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:3
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
+ i32 2, ; cbsz
+ i32 3, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <4 x float> %result
+}
+
+; This should be optimized to avoid the scale
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], 0, 0 op_sel_hi:[0,0,0] cbsz:2 blgp:3
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
+ i32 2, ; cbsz
+ i32 3, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <4 x float> %result
+}
+
+
+; bf6 x fp8
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+ i32 3, ; cbsz
+ i32 0, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <4 x float> %result
+}
+
+; This should be optimized to avoid the scale
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], 0, 0 op_sel_hi:[0,0,0] cbsz:3
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+ i32 3, ; cbsz
+ i32 0, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <4 x float> %result
+}
+
+; bf6 x bf8
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 blgp:1
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+ i32 3, ; cbsz
+ i32 1, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <4 x float> %result
+}
+
+; This should be optimized to avoid the scale
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], 0, 0 op_sel_hi:[0,0,0] cbsz:3 blgp:1
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+ i32 3, ; cbsz
+ i32 1, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <4 x float> %result
+}
+
+; bf6 x fp6
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:2
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
+ i32 3, ; cbsz
+ i32 2, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <4 x float> %result
+}
+
+; This should be optimized to avoid the scale
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], 0, 0 op_sel_hi:[0,0,0] cbsz:3 blgp:2
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
+ i32 3, ; cbsz
+ i32 2, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <4 x float> %result
+}
+
+; bf6 x fp4
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v10
+; GCN-NEXT: v_accvgpr_write_b32 a1, v11
+; GCN-NEXT: v_accvgpr_write_b32 a2, v12
+; GCN-NEXT: v_accvgpr_write_b32 a3, v13
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:3 blgp:4
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
+ i32 3, ; cbsz
+ i32 4, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <4 x float> %result
+}
+
+; This should be optimized to avoid the scale
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v10
+; GCN-NEXT: v_accvgpr_write_b32 a1, v11
+; GCN-NEXT: v_accvgpr_write_b32 a2, v12
+; GCN-NEXT: v_accvgpr_write_b32 a3, v13
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], 0, 0 op_sel_hi:[0,0,0] cbsz:3 blgp:4
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
+ i32 3, ; cbsz
+ i32 4, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <4 x float> %result
+}
+
+; bf6 x bf6
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:3
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
+ i32 3, ; cbsz
+ i32 3, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <4 x float> %result
+}
+
+; This should be optimized to avoid the scale
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], 0, 0 op_sel_hi:[0,0,0] cbsz:3 blgp:3
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
+ i32 3, ; cbsz
+ i32 3, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <4 x float> %result
+}
+
+; fp6 x fp4
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v10
+; GCN-NEXT: v_accvgpr_write_b32 a1, v11
+; GCN-NEXT: v_accvgpr_write_b32 a2, v12
+; GCN-NEXT: v_accvgpr_write_b32 a3, v13
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:2 blgp:4
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
+ i32 2, ; cbsz
+ i32 4, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <4 x float> %result
+}
+
+; This should be optimized to avoid the scale
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v10
+; GCN-NEXT: v_accvgpr_write_b32 a1, v11
+; GCN-NEXT: v_accvgpr_write_b32 a2, v12
+; GCN-NEXT: v_accvgpr_write_b32 a3, v13
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], 0, 0 op_sel_hi:[0,0,0] cbsz:2 blgp:4
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
+ i32 2, ; cbsz
+ i32 4, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <4 x float> %result
+}
+
+; fp4 x fp8
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+ i32 4, ; cbsz
+ i32 0, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <4 x float> %result
+}
+
+; This should be optimized to avoid the scale
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], 0, 0 op_sel_hi:[0,0,0] cbsz:4
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+ i32 4, ; cbsz
+ i32 0, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <4 x float> %result
+}
+
+; fp4 x bf8
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 blgp:1
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+ i32 4, ; cbsz
+ i32 1, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <4 x float> %result
+}
+
+; This should be optimized to avoid the scale
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], 0, 0 op_sel_hi:[0,0,0] cbsz:4 blgp:1
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+ i32 4, ; cbsz
+ i32 1, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <4 x float> %result
+}
+
+; fp4 x fp6
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v10
+; GCN-NEXT: v_accvgpr_write_b32 a1, v11
+; GCN-NEXT: v_accvgpr_write_b32 a2, v12
+; GCN-NEXT: v_accvgpr_write_b32 a3, v13
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:2
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
+ i32 4, ; cbsz
+ i32 2, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <4 x float> %result
+}
+
+; This should be optimized to avoid the scale
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v10
+; GCN-NEXT: v_accvgpr_write_b32 a1, v11
+; GCN-NEXT: v_accvgpr_write_b32 a2, v12
+; GCN-NEXT: v_accvgpr_write_b32 a3, v13
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], 0, 0 op_sel_hi:[0,0,0] cbsz:4 blgp:2
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
+ i32 4, ; cbsz
+ i32 2, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <4 x float> %result
+}
+
+; fp4 x bf6
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v10
+; GCN-NEXT: v_accvgpr_write_b32 a1, v11
+; GCN-NEXT: v_accvgpr_write_b32 a2, v12
+; GCN-NEXT: v_accvgpr_write_b32 a3, v13
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:3
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
+ i32 4, ; cbsz
+ i32 3, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <4 x float> %result
+}
+
+; This should be optimized to avoid the scale
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v10
+; GCN-NEXT: v_accvgpr_write_b32 a1, v11
+; GCN-NEXT: v_accvgpr_write_b32 a2, v12
+; GCN-NEXT: v_accvgpr_write_b32 a3, v13
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], 0, 0 op_sel_hi:[0,0,0] cbsz:4 blgp:3
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
+ i32 4, ; cbsz
+ i32 3, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <4 x float> %result
+}
+
+; fp4 x fp4
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v8
+; GCN-NEXT: v_accvgpr_write_b32 a1, v9
+; GCN-NEXT: v_accvgpr_write_b32 a2, v10
+; GCN-NEXT: v_accvgpr_write_b32 a3, v11
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3], v12, v13 op_sel_hi:[0,0,0] cbsz:4 blgp:4
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
+ i32 4, ; cbsz
+ i32 4, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <4 x float> %result
+}
+
+; This should be optimized to avoid the scale
+; cbsz=4/blgp=4 with both scale operands set to the constant 0. The checks
+; currently still expect v_mfma_scale with 0, 0 as the scale operands.
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v8
+; GCN-NEXT: v_accvgpr_write_b32 a1, v9
+; GCN-NEXT: v_accvgpr_write_b32 a2, v10
+; GCN-NEXT: v_accvgpr_write_b32 a3, v11
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3], 0, 0 op_sel_hi:[0,0,0] cbsz:4 blgp:4
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
+ i32 4, ; cbsz
+ i32 4, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <4 x float> %result
+}
+
+; --------------------------------------------------------------------
+; Different input parameter classes
+; --------------------------------------------------------------------
+
+; Both scales are inreg arguments. The checks keep scale A in an SGPR (s0) and
+; copy scale B into a VGPR (v16) before the MFMA.
+; NOTE(review): presumably only one SGPR scale operand is allowed - confirm.
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 inreg %scale0, i32 inreg %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: v_mov_b32_e32 v16, s1
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v16 op_sel_hi:[0,0,0]
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <4 x float> %result
+}
+
+; Scale A is an inreg argument and is used directly as s0; scale B is an
+; ordinary argument and is used as v20.
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 inreg %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v20 op_sel_hi:[0,0,0]
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <4 x float> %result
+}
+
+; Scale A is an ordinary argument and is used as v20; scale B is an inreg
+; argument and is used directly as s0.
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 inreg %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, s0 op_sel_hi:[0,0,0]
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <4 x float> %result
+}
+
+; All three data operands (%arg0, %arg1, %arg2) are inreg while the scales are
+; ordinary arguments (used as v2, v3). The checks move the data operands into
+; VGPRs and AGPRs before the MFMA; SDAG and GISEL emit different copy
+; sequences, so each selector has its own check prefix.
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x float> inreg %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v20, s28
+; SDAG-NEXT: v_mov_b32_e32 v23, v1
+; SDAG-NEXT: v_mov_b32_e32 v22, v0
+; SDAG-NEXT: v_mov_b32_e32 v21, s29
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v20
+; SDAG-NEXT: v_mov_b32_e32 v4, s20
+; SDAG-NEXT: v_mov_b32_e32 v5, s21
+; SDAG-NEXT: v_mov_b32_e32 v6, s22
+; SDAG-NEXT: v_mov_b32_e32 v7, s23
+; SDAG-NEXT: v_mov_b32_e32 v8, s24
+; SDAG-NEXT: v_mov_b32_e32 v9, s25
+; SDAG-NEXT: v_mov_b32_e32 v10, s26
+; SDAG-NEXT: v_mov_b32_e32 v11, s27
+; SDAG-NEXT: v_mov_b32_e32 v12, s0
+; SDAG-NEXT: v_mov_b32_e32 v13, s1
+; SDAG-NEXT: v_mov_b32_e32 v14, s2
+; SDAG-NEXT: v_mov_b32_e32 v15, s3
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
+; SDAG-NEXT: v_mov_b32_e32 v17, s17
+; SDAG-NEXT: v_mov_b32_e32 v18, s18
+; SDAG-NEXT: v_mov_b32_e32 v19, s19
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v23
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[12:19], v[4:11], a[0:3], v2, v3 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s12, s0
+; GISEL-NEXT: s_mov_b32 s13, s1
+; GISEL-NEXT: v_mov_b32_e32 v20, s28
+; GISEL-NEXT: s_mov_b32 s14, s2
+; GISEL-NEXT: s_mov_b32 s15, s3
+; GISEL-NEXT: v_mov_b32_e32 v22, v0
+; GISEL-NEXT: v_mov_b32_e32 v23, v1
+; GISEL-NEXT: v_mov_b32_e32 v21, s29
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v20
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v23
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[4:11], v[12:19], a[0:3], v2, v3 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <4 x float> %result
+}
+
+; Only %arg0 and %scale0 are inreg. The checks copy %arg0 into v[14:21], use
+; scale A directly as s20 and scale B as v12.
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 inreg %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT: v_mov_b32_e32 v14, s0
+; SDAG-NEXT: v_mov_b32_e32 v15, s1
+; SDAG-NEXT: v_mov_b32_e32 v16, s2
+; SDAG-NEXT: v_mov_b32_e32 v17, s3
+; SDAG-NEXT: v_mov_b32_e32 v18, s16
+; SDAG-NEXT: v_mov_b32_e32 v19, s17
+; SDAG-NEXT: v_mov_b32_e32 v20, s18
+; SDAG-NEXT: v_mov_b32_e32 v21, s19
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], s20, v12 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s12, s0
+; GISEL-NEXT: s_mov_b32 s13, s1
+; GISEL-NEXT: s_mov_b32 s14, s2
+; GISEL-NEXT: s_mov_b32 s15, s3
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13]
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], s20, v12 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <4 x float> %result
+}
+
+; Only %arg0 and %scale1 are inreg. The checks copy %arg0 into v[14:21], use
+; scale A as v12 and scale B directly as s20.
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 inreg %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT: v_mov_b32_e32 v14, s0
+; SDAG-NEXT: v_mov_b32_e32 v15, s1
+; SDAG-NEXT: v_mov_b32_e32 v16, s2
+; SDAG-NEXT: v_mov_b32_e32 v17, s3
+; SDAG-NEXT: v_mov_b32_e32 v18, s16
+; SDAG-NEXT: v_mov_b32_e32 v19, s17
+; SDAG-NEXT: v_mov_b32_e32 v20, s18
+; SDAG-NEXT: v_mov_b32_e32 v21, s19
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v12, s20 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s12, s0
+; GISEL-NEXT: s_mov_b32 s13, s1
+; GISEL-NEXT: s_mov_b32 s14, s2
+; GISEL-NEXT: s_mov_b32 s15, s3
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13]
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v12, s20 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <4 x float> %result
+}
+
+; Only %arg1 and %scale1 are inreg. The checks copy %arg1 into v[14:21] (the
+; second source operand), use scale A as v12 and scale B directly as s20.
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr(<8 x i32> %arg0, <8 x i32> inreg %arg1, <4 x float> %arg2, i32 %scale0, i32 inreg %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT: v_mov_b32_e32 v14, s0
+; SDAG-NEXT: v_mov_b32_e32 v15, s1
+; SDAG-NEXT: v_mov_b32_e32 v16, s2
+; SDAG-NEXT: v_mov_b32_e32 v17, s3
+; SDAG-NEXT: v_mov_b32_e32 v18, s16
+; SDAG-NEXT: v_mov_b32_e32 v19, s17
+; SDAG-NEXT: v_mov_b32_e32 v20, s18
+; SDAG-NEXT: v_mov_b32_e32 v21, s19
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, s20 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s12, s0
+; GISEL-NEXT: s_mov_b32 s13, s1
+; GISEL-NEXT: s_mov_b32 s14, s2
+; GISEL-NEXT: s_mov_b32 s15, s3
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13]
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, s20 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <4 x float> %result
+}
+
+; Only the accumulator %arg2 and %scale1 are inreg. The checks write a[0:3]
+; directly from s0-s3, use scale A as v16 and scale B directly as s16.
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> inreg %arg2, i32 %scale0, i32 inreg %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, s0
+; GCN-NEXT: v_accvgpr_write_b32 a1, s1
+; GCN-NEXT: v_accvgpr_write_b32 a2, s2
+; GCN-NEXT: v_accvgpr_write_b32 a3, s3
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, s16 op_sel_hi:[0,0,0]
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <4 x float> %result
+}
+
+; %arg0, the accumulator %arg2 and %scale1 are inreg. The checks copy %arg0
+; into v[10:17], write a[0:3] directly from s20-s23, use scale A as v8 and
+; scale B directly as s24.
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <4 x float> inreg %arg2, i32 %scale0, i32 inreg %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v10, s0
+; SDAG-NEXT: v_mov_b32_e32 v11, s1
+; SDAG-NEXT: v_mov_b32_e32 v12, s2
+; SDAG-NEXT: v_mov_b32_e32 v13, s3
+; SDAG-NEXT: v_mov_b32_e32 v14, s16
+; SDAG-NEXT: v_mov_b32_e32 v15, s17
+; SDAG-NEXT: v_mov_b32_e32 v16, s18
+; SDAG-NEXT: v_mov_b32_e32 v17, s19
+; SDAG-NEXT: v_accvgpr_write_b32 a0, s20
+; SDAG-NEXT: v_accvgpr_write_b32 a1, s21
+; SDAG-NEXT: v_accvgpr_write_b32 a2, s22
+; SDAG-NEXT: v_accvgpr_write_b32 a3, s23
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[10:17], v[0:7], a[0:3], v8, s24 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s12, s0
+; GISEL-NEXT: s_mov_b32 s13, s1
+; GISEL-NEXT: s_mov_b32 s14, s2
+; GISEL-NEXT: s_mov_b32 s15, s3
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, s20
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[18:19]
+; GISEL-NEXT: v_accvgpr_write_b32 a1, s21
+; GISEL-NEXT: v_accvgpr_write_b32 a2, s22
+; GISEL-NEXT: v_accvgpr_write_b32 a3, s23
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[10:17], v[0:7], a[0:3], v8, s24 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <4 x float> %result
+}
+
+; Both scales are constants (33 and -2) and the checks expect them directly as
+; immediate operands of the MFMA. The %scale0/%scale1 arguments are unused, and
+; the op_sel arguments are both 2.
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 33, -2 op_sel_hi:[0,0,0]
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 2, i32 33, i32 2, i32 -2)
+ ret <4 x float> %result
+}
+
+; Scale A is the constant 65 (0x41), which the checks materialize into a
+; register first (s0 via s_movk_i32 for SDAG, v16 via v_mov_b32 for GISEL).
+; Scale B is the constant -2 and stays an immediate operand.
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: s_movk_i32 s0, 0x41
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, -2 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_mov_b32_e32 v16, 0x41
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, -2 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 -2)
+ ret <4 x float> %result
+}
+
+; Both scales are constants (65 = 0x41 and 77 = 0x4d) that the checks
+; materialize into registers before the MFMA: SDAG uses s0 and v16, GISEL uses
+; v16 and v17.
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_kimm(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_kimm:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: s_movk_i32 s0, 0x41
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_mov_b32_e32 v16, 0x4d
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v16 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_kimm:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_mov_b32_e32 v16, 0x41
+; GISEL-NEXT: v_mov_b32_e32 v17, 0x4d
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 77)
+ ret <4 x float> %result
+}
+
+; Kernel variant: all operands are fetched with s_load_dword* instructions and
+; the result is stored to %ptr straight from a[0:3], with no v_accvgpr_read
+; copies. The call uses blgp=2 and op_sel arguments 3 and 1; scale A is used as
+; an SGPR and scale B is copied into a VGPR.
+define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1, ptr addrspace(1) %ptr) #0 {
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v0, s8
+; SDAG-NEXT: v_mov_b32_e32 v1, s9
+; SDAG-NEXT: v_mov_b32_e32 v2, s10
+; SDAG-NEXT: v_mov_b32_e32 v3, s11
+; SDAG-NEXT: v_mov_b32_e32 v4, s12
+; SDAG-NEXT: v_mov_b32_e32 v5, s13
+; SDAG-NEXT: v_mov_b32_e32 v6, s14
+; SDAG-NEXT: v_mov_b32_e32 v7, s15
+; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
+; SDAG-NEXT: v_mov_b32_e32 v8, s16
+; SDAG-NEXT: v_mov_b32_e32 v9, s17
+; SDAG-NEXT: v_mov_b32_e32 v10, s18
+; SDAG-NEXT: v_mov_b32_e32 v11, s19
+; SDAG-NEXT: v_mov_b32_e32 v12, s20
+; SDAG-NEXT: v_mov_b32_e32 v13, s21
+; SDAG-NEXT: v_mov_b32_e32 v14, s22
+; SDAG-NEXT: v_mov_b32_e32 v15, s23
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s12, v17 op_sel_hi:[0,0,0] blgp:2
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[14:15]
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
+; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x40
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, s24
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; GISEL-NEXT: v_accvgpr_write_b32 a1, s25
+; GISEL-NEXT: v_accvgpr_write_b32 a2, s26
+; GISEL-NEXT: v_accvgpr_write_b32 a3, s27
+; GISEL-NEXT: v_mov_b32_e32 v16, s29
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s28, v16 op_sel_hi:[0,0,0] blgp:2
+; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: s_nop 2
+; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[30:31]
+; GISEL-NEXT: s_endpgm
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1)
+ store <4 x float> %result, ptr addrspace(1) %ptr, align 16
+ ret void
+}
+
+; Kernel variant with constant scales: 65 (0x41) is materialized into a
+; register (s6 for SDAG, v16 for GISEL) and -2 is an immediate operand. The
+; op_sel arguments are 3 and 1, and the result is stored to %ptr straight from
+; a[0:3].
+define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, ptr addrspace(1) %ptr) #0 {
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
+; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
+; SDAG-NEXT: s_movk_i32 s6, 0x41
+; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; SDAG-NEXT: v_mov_b32_e32 v0, s8
+; SDAG-NEXT: v_mov_b32_e32 v1, s9
+; SDAG-NEXT: v_mov_b32_e32 v2, s10
+; SDAG-NEXT: v_mov_b32_e32 v3, s11
+; SDAG-NEXT: v_mov_b32_e32 v4, s12
+; SDAG-NEXT: v_mov_b32_e32 v5, s13
+; SDAG-NEXT: v_mov_b32_e32 v6, s14
+; SDAG-NEXT: v_mov_b32_e32 v7, s15
+; SDAG-NEXT: v_mov_b32_e32 v8, s16
+; SDAG-NEXT: v_mov_b32_e32 v9, s17
+; SDAG-NEXT: v_mov_b32_e32 v10, s18
+; SDAG-NEXT: v_mov_b32_e32 v11, s19
+; SDAG-NEXT: v_mov_b32_e32 v12, s20
+; SDAG-NEXT: v_mov_b32_e32 v13, s21
+; SDAG-NEXT: v_mov_b32_e32 v14, s22
+; SDAG-NEXT: v_mov_b32_e32 v15, s23
+; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s6, -2 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5]
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
+; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
+; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50
+; GISEL-NEXT: v_mov_b32_e32 v16, 0x41
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, -2 op_sel_hi:[0,0,0]
+; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: s_nop 2
+; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GISEL-NEXT: s_endpgm
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 -2)
+ store <4 x float> %result, ptr addrspace(1) %ptr, align 16
+ ret void
+}
+
+; This should be optimized to avoid the scale
+; All op_sel and scale arguments are the constant 0 (%scale0/%scale1 are
+; unused). The checks currently still expect v_mfma_scale with 0, 0 as the
+; scale operands.
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 0, 0 op_sel_hi:[0,0,0]
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ ret <4 x float> %result
+}
+
+; This should be optimized to avoid the scale, with non-0 op_sel arguments.
+; Both scales are the constant 0 while the op_sel arguments are 3 and 1. The
+; expected MFMA line is identical to the one in the constant_scale_0_0_a test.
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 0, 0 op_sel_hi:[0,0,0]
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 0, i32 1, i32 0)
+ ret <4 x float> %result
+}
+
+; Scale A is the constant 0 and scale B is the constant 1 (op_sel arguments
+; are 0). The checks expect the scaled MFMA with 0, 1 as immediate operands.
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 0, 1 op_sel_hi:[0,0,0]
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1)
+ ret <4 x float> %result
+}
+
+; Scale A is the constant 1 and scale B is the constant 0 (op_sel arguments
+; are 0). The checks expect the scaled MFMA with 1, 0 as immediate operands.
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1, 0 op_sel_hi:[0,0,0]
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0)
+ ret <4 x float> %result
+}
+
+; --------------------------------------------------------------------
+; Incorrect signature for format cases (IR vector too large)
+; --------------------------------------------------------------------
+
+; fp8 x fp6 by name: cbsz = 0, blgp = 2, with both sources typed <8 x i32>.
+; The checks expect only blgp:2 on the MFMA (no cbsz modifier is printed) and
+; the two scale values in v20/v21.
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:2
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+ i32 0, ; cbsz
+ i32 2, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <4 x float> %result
+}
+
+; fp6 x fp8 by name: cbsz = 2, blgp = 0, with both sources typed <8 x i32>.
+; The checks expect only cbsz:2 on the MFMA (no blgp modifier is printed) and
+; the two scale values in v20/v21.
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+ i32 2, ; cbsz
+ i32 0, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <4 x float> %result
+}
+
+; fp6 x fp6 by name: cbsz = 2 and blgp = 2, with both sources typed
+; <8 x i32>. The checks expect both cbsz:2 and blgp:2 on the MFMA and the two
+; scale values in v20/v21.
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2 blgp:2
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+ i32 2, ; cbsz
+ i32 2, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <4 x float> %result
+}
+
+; fp6 x fp6 by name (cbsz = 2, blgp = 2) with constant-zero scales: the
+; function takes no scale parameters and the call passes 0 for both scale
+; operands. The checks expect "0, 0" as the final two source operands.
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 0, 0 op_sel_hi:[0,0,0] cbsz:2 blgp:2
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+ i32 2, ; cbsz
+ i32 2, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <4 x float> %result
+}
+
+; fp8 x fp4 by name: cbsz = 0, blgp = 4, with both sources typed <8 x i32>.
+; The checks expect only blgp:4 on the MFMA (no cbsz modifier is printed) and
+; the two scale values in v20/v21.
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:4
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+ i32 0, ; cbsz
+ i32 4, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <4 x float> %result
+}
+
+; fp4 x fp8 by name: cbsz = 4, blgp = 0, with both sources typed <8 x i32>.
+; The checks expect only cbsz:4 on the MFMA (no blgp modifier is printed) and
+; the two scale values in v20/v21.
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+ i32 4, ; cbsz
+ i32 0, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <4 x float> %result
+}
+
+; fp8 x fp4 by name (cbsz = 0, blgp = 4) using the .v8i32.v6i32 overload:
+; %arg1 is <6 x i32>. The checks therefore expect the second source in
+; v[8:13], the accumulator inputs copied from v14-v17, and the two scale
+; values in v18/v19.
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:4
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
+ i32 0, ; cbsz
+ i32 4, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <4 x float> %result
+}
+
+; fp4 x fp8 by name (cbsz = 4, blgp = 0) using the .v6i32.v8i32 overload:
+; %arg0 is <6 x i32>. The checks therefore expect the first source in v[0:5],
+; the second in v[6:13], the accumulator inputs copied from v14-v17, and the
+; two scale values in v18/v19.
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:4
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+ i32 4, ; cbsz
+ i32 0, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <4 x float> %result
+}
+
+; fp4 x fp4 by name: cbsz = 4 and blgp = 4, with both sources typed
+; <8 x i32>. The checks expect both cbsz:4 and blgp:4 on the MFMA and the two
+; scale values in v20/v21.
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4 blgp:4
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+ i32 4, ; cbsz
+ i32 4, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <4 x float> %result
+}
+
+; fp4 x fp4 by name (cbsz = 4, blgp = 4) with constant-zero scales: the
+; function takes no scale parameters and the call passes 0 for both scale
+; operands. The checks expect "0, 0" as the final two source operands.
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 0, 0 op_sel_hi:[0,0,0] cbsz:4 blgp:4
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+ i32 4, ; cbsz
+ i32 4, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <4 x float> %result
+}
+
+; Intrinsic overloads, one for each pairing of <4 x i32>, <6 x i32> and
+; <8 x i32> source vectors; the .vNi32.vMi32 name suffix spells the types of
+; the first two operands. Of the six trailing i32 operands, the first three
+; and the fifth are immarg; the fourth and sixth (the scale values) are not.
+declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32>, <8 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1
+declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32>, <6 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1
+declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32>, <4 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1
+declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32>, <6 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1
+declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32>, <8 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1
+declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32>, <4 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1
+declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32>, <8 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1
+declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32>, <4 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1
+declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32>, <6 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1
+
+; #1 is the attribute group attached to every declaration above.
+; NOTE(review): #0 has no visible user in this part of the file -- presumably
+; referenced by a test earlier in the file; confirm.
+attributes #0 = { "amdgpu-flat-work-group-size"="512,512" }
+attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
new file mode 100644
index 00000000000000..67d887d87dd973
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
@@ -0,0 +1,6004 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -global-isel=0 < %s | FileCheck -check-prefixes=GCN,SDAG %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -global-isel=1 < %s | FileCheck -check-prefixes=GCN,GISEL %s
+
+; 0 = fp8
+; 1 = bf8
+; 2 = fp6
+; 3 = bf6
+; 4 = fp4
+
+; --------------------------------------------------------------------
+; Different format signatures
+; --------------------------------------------------------------------
+
+; fp8 x fp8
+; Baseline 32x32x64 case: cbsz = 0, blgp = 0, and 0 for the immediate that
+; precedes each scale operand. The checks expect the last accumulator element
+; (a15) and both scale values to be fetched with scratch_load_dword relative
+; to s32 (offsets 0, 4 and 8). The SelectionDAG and GlobalISel expectations
+; differ only in which of v31/v32 receives offset 4 versus offset 8; in both,
+; the first scale operand of the MFMA is the value loaded from offset 4.
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 0, ; cbsz
+ i32 0, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+; cbsz = 0 and blgp = 0; the immediate preceding each scale operand is 1.
+; The expected MFMA line prints no modifier for those immediates (only
+; op_sel_hi:[0,0,0] follows the operands) and no cbsz/blgp modifier.
+; NOTE(review): the function name says cbsz1/blgp1, but the call passes
+; cbsz = 0 and blgp = 0 -- confirm whether the name or the arguments are
+; what was intended.
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 0, ; cbsz
+ i32 0, ; blgp
+ i32 1, i32 %scale0, i32 1, i32 %scale1)
+ ret <16 x float> %result
+}
+
+; cbsz = 0 and blgp = 0; the immediate preceding each scale operand is 2.
+; The expected MFMA line prints no modifier for those immediates (only
+; op_sel_hi:[0,0,0] follows the operands) and no cbsz/blgp modifier.
+; NOTE(review): the function name says cbsz1/blgp1, but the call passes
+; cbsz = 0 and blgp = 0 -- confirm whether the name or the arguments are
+; what was intended.
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 0, ; cbsz
+ i32 0, ; blgp
+ i32 2, i32 %scale0, i32 2, i32 %scale1)
+ ret <16 x float> %result
+}
+
+; cbsz = 0 and blgp = 0; the immediate preceding each scale operand is 3.
+; The expected MFMA line prints no modifier for those immediates (only
+; op_sel_hi:[0,0,0] follows the operands) and no cbsz/blgp modifier.
+; NOTE(review): the function name says cbsz1/blgp1, but the call passes
+; cbsz = 0 and blgp = 0 -- confirm whether the name or the arguments are
+; what was intended.
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 0, ; cbsz
+ i32 0, ; blgp
+ i32 3, i32 %scale0, i32 3, i32 %scale1)
+ ret <16 x float> %result
+}
+
+; cbsz = 0 and blgp = 0; the immediate preceding the first scale operand is 0
+; and the one preceding the second is 3. The expected MFMA line prints no
+; modifier for those immediates (only op_sel_hi:[0,0,0] follows the operands)
+; and no cbsz/blgp modifier.
+; NOTE(review): the function name says cbsz1/blgp1, but the call passes
+; cbsz = 0 and blgp = 0 -- confirm whether the name or the arguments are
+; what was intended.
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 0, ; cbsz
+ i32 0, ; blgp
+ i32 0, i32 %scale0, i32 3, i32 %scale1)
+ ret <16 x float> %result
+}
+
+; Variable scales with immediates 3 (before %scale0) and 0 (before %scale1);
+; presumably these are the per-scale op_sel selectors (TODO confirm). The
+; expected instruction prints op_sel_hi:[0,0,0] under both SDAG and GISEL; the
+; two sets of checks differ only in which VGPR each stack-passed scale lands in.
+; NOTE(review): the name says cbsz1/blgp1, but the call passes cbsz=0, blgp=0.
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 0, ; cbsz
+ i32 0, ; blgp
+ i32 3, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+; Variable scales with immediates 2 (before %scale0) and 3 (before %scale1);
+; presumably these are the per-scale op_sel selectors (TODO confirm). The
+; expected instruction prints op_sel_hi:[0,0,0] under both SDAG and GISEL; the
+; two sets of checks differ only in which VGPR each stack-passed scale lands in.
+; NOTE(review): the name says cbsz1/blgp1, but the call passes cbsz=0, blgp=0.
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 0, ; cbsz
+ i32 0, ; blgp
+ i32 2, i32 %scale0, i32 3, i32 %scale1)
+ ret <16 x float> %result
+}
+
+; Variable scales with immediates 3 (before %scale0) and 2 (before %scale1);
+; presumably these are the per-scale op_sel selectors (TODO confirm). The
+; expected instruction prints op_sel_hi:[0,0,0] under both SDAG and GISEL; the
+; two sets of checks differ only in which VGPR each stack-passed scale lands in.
+; NOTE(review): the name says cbsz1/blgp1, but the call passes cbsz=0, blgp=0.
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 0, ; cbsz
+ i32 0, ; blgp
+ i32 3, i32 %scale0, i32 2, i32 %scale1)
+ ret <16 x float> %result
+}
+
+; Both scale operands and their accompanying immediates are constant 0.
+; This should be optimized to avoid the scale; for now the checks still expect
+; v_mfma_scale_f32_32x32x64_f8f6f4 with literal 0, 0 scale operands.
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_dword a15, off, s32
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: v_accvgpr_write_b32 a4, v20
+; GCN-NEXT: v_accvgpr_write_b32 a5, v21
+; GCN-NEXT: v_accvgpr_write_b32 a6, v22
+; GCN-NEXT: v_accvgpr_write_b32 a7, v23
+; GCN-NEXT: v_accvgpr_write_b32 a8, v24
+; GCN-NEXT: v_accvgpr_write_b32 a9, v25
+; GCN-NEXT: v_accvgpr_write_b32 a10, v26
+; GCN-NEXT: v_accvgpr_write_b32 a11, v27
+; GCN-NEXT: v_accvgpr_write_b32 a12, v28
+; GCN-NEXT: v_accvgpr_write_b32 a13, v29
+; GCN-NEXT: v_accvgpr_write_b32 a14, v30
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 0, 0 op_sel_hi:[0,0,0]
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 0, ; cbsz
+ i32 0, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <16 x float> %result
+}
+
+; fp8 x bf8
+; cbsz=0, blgp=1 with variable scales; the expected instruction carries the
+; blgp:1 modifier. The SDAG and GISEL checks differ only in which VGPR each
+; stack-passed scale lands in.
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] blgp:1
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] blgp:1
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 0, ; cbsz
+ i32 1, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+; cbsz=0, blgp=1 with both scales constant 0. The checks expect literal 0, 0
+; scale operands plus the blgp:1 modifier, and the only scratch load is the one
+; into a15.
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_dword a15, off, s32
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: v_accvgpr_write_b32 a4, v20
+; GCN-NEXT: v_accvgpr_write_b32 a5, v21
+; GCN-NEXT: v_accvgpr_write_b32 a6, v22
+; GCN-NEXT: v_accvgpr_write_b32 a7, v23
+; GCN-NEXT: v_accvgpr_write_b32 a8, v24
+; GCN-NEXT: v_accvgpr_write_b32 a9, v25
+; GCN-NEXT: v_accvgpr_write_b32 a10, v26
+; GCN-NEXT: v_accvgpr_write_b32 a11, v27
+; GCN-NEXT: v_accvgpr_write_b32 a12, v28
+; GCN-NEXT: v_accvgpr_write_b32 a13, v29
+; GCN-NEXT: v_accvgpr_write_b32 a14, v30
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 0, 0 op_sel_hi:[0,0,0] blgp:1
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 0, ; cbsz
+ i32 1, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <16 x float> %result
+}
+
+; fp8 x fp6
+; cbsz=0, blgp=2 with variable scales. %arg1 is <6 x i32> and the checks expect
+; it in v[8:13]; one dword is loaded from scratch into v31, and v30, v31 are
+; the scale operands of the expected instruction.
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_dword v31, off, s32
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: v_accvgpr_write_b32 a4, v18
+; GCN-NEXT: v_accvgpr_write_b32 a5, v19
+; GCN-NEXT: v_accvgpr_write_b32 a6, v20
+; GCN-NEXT: v_accvgpr_write_b32 a7, v21
+; GCN-NEXT: v_accvgpr_write_b32 a8, v22
+; GCN-NEXT: v_accvgpr_write_b32 a9, v23
+; GCN-NEXT: v_accvgpr_write_b32 a10, v24
+; GCN-NEXT: v_accvgpr_write_b32 a11, v25
+; GCN-NEXT: v_accvgpr_write_b32 a12, v26
+; GCN-NEXT: v_accvgpr_write_b32 a13, v27
+; GCN-NEXT: v_accvgpr_write_b32 a14, v28
+; GCN-NEXT: v_accvgpr_write_b32 a15, v29
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:2
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+ i32 0, ; cbsz
+ i32 2, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+; cbsz=0, blgp=2 with a <6 x i32> %arg1 and both scales constant 0. The checks
+; expect literal 0, 0 scale operands plus the blgp:2 modifier, and contain no
+; scratch loads.
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: v_accvgpr_write_b32 a4, v18
+; GCN-NEXT: v_accvgpr_write_b32 a5, v19
+; GCN-NEXT: v_accvgpr_write_b32 a6, v20
+; GCN-NEXT: v_accvgpr_write_b32 a7, v21
+; GCN-NEXT: v_accvgpr_write_b32 a8, v22
+; GCN-NEXT: v_accvgpr_write_b32 a9, v23
+; GCN-NEXT: v_accvgpr_write_b32 a10, v24
+; GCN-NEXT: v_accvgpr_write_b32 a11, v25
+; GCN-NEXT: v_accvgpr_write_b32 a12, v26
+; GCN-NEXT: v_accvgpr_write_b32 a13, v27
+; GCN-NEXT: v_accvgpr_write_b32 a14, v28
+; GCN-NEXT: v_accvgpr_write_b32 a15, v29
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], 0, 0 op_sel_hi:[0,0,0] blgp:2
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+ i32 0, ; cbsz
+ i32 2, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <16 x float> %result
+}
+
+; fp8 x bf6
+; cbsz=0, blgp=3 with variable scales. %arg1 is <6 x i32> and the checks expect
+; it in v[8:13]; one dword is loaded from scratch into v31, and v30, v31 are
+; the scale operands of the expected instruction.
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_dword v31, off, s32
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: v_accvgpr_write_b32 a4, v18
+; GCN-NEXT: v_accvgpr_write_b32 a5, v19
+; GCN-NEXT: v_accvgpr_write_b32 a6, v20
+; GCN-NEXT: v_accvgpr_write_b32 a7, v21
+; GCN-NEXT: v_accvgpr_write_b32 a8, v22
+; GCN-NEXT: v_accvgpr_write_b32 a9, v23
+; GCN-NEXT: v_accvgpr_write_b32 a10, v24
+; GCN-NEXT: v_accvgpr_write_b32 a11, v25
+; GCN-NEXT: v_accvgpr_write_b32 a12, v26
+; GCN-NEXT: v_accvgpr_write_b32 a13, v27
+; GCN-NEXT: v_accvgpr_write_b32 a14, v28
+; GCN-NEXT: v_accvgpr_write_b32 a15, v29
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:3
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+ i32 0, ; cbsz
+ i32 3, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+; cbsz=0, blgp=3 with a <6 x i32> %arg1 and both scales constant 0. The checks
+; expect literal 0, 0 scale operands plus the blgp:3 modifier, and contain no
+; scratch loads.
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: v_accvgpr_write_b32 a4, v18
+; GCN-NEXT: v_accvgpr_write_b32 a5, v19
+; GCN-NEXT: v_accvgpr_write_b32 a6, v20
+; GCN-NEXT: v_accvgpr_write_b32 a7, v21
+; GCN-NEXT: v_accvgpr_write_b32 a8, v22
+; GCN-NEXT: v_accvgpr_write_b32 a9, v23
+; GCN-NEXT: v_accvgpr_write_b32 a10, v24
+; GCN-NEXT: v_accvgpr_write_b32 a11, v25
+; GCN-NEXT: v_accvgpr_write_b32 a12, v26
+; GCN-NEXT: v_accvgpr_write_b32 a13, v27
+; GCN-NEXT: v_accvgpr_write_b32 a14, v28
+; GCN-NEXT: v_accvgpr_write_b32 a15, v29
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], 0, 0 op_sel_hi:[0,0,0] blgp:3
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+ i32 0, ; cbsz
+ i32 3, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <16 x float> %result
+}
+
+; fp8 x fp4
+; cbsz=0, blgp=4 with variable scales. %arg1 is <4 x i32> and the checks expect
+; it in v[8:11]; the scale operands are v28, v29 and the checks contain no
+; scratch loads.
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: v_accvgpr_write_b32 a4, v16
+; GCN-NEXT: v_accvgpr_write_b32 a5, v17
+; GCN-NEXT: v_accvgpr_write_b32 a6, v18
+; GCN-NEXT: v_accvgpr_write_b32 a7, v19
+; GCN-NEXT: v_accvgpr_write_b32 a8, v20
+; GCN-NEXT: v_accvgpr_write_b32 a9, v21
+; GCN-NEXT: v_accvgpr_write_b32 a10, v22
+; GCN-NEXT: v_accvgpr_write_b32 a11, v23
+; GCN-NEXT: v_accvgpr_write_b32 a12, v24
+; GCN-NEXT: v_accvgpr_write_b32 a13, v25
+; GCN-NEXT: v_accvgpr_write_b32 a14, v26
+; GCN-NEXT: v_accvgpr_write_b32 a15, v27
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] blgp:4
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
+ i32 0, ; cbsz
+ i32 4, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+; Checks the 32x32x64 scaled MFMA with cbsz=0, blgp=4 when both scale arguments
+; are the constant 0: the checks expect literal 0, 0 scale operands on the
+; instruction rather than VGPRs.
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: v_accvgpr_write_b32 a4, v16
+; GCN-NEXT: v_accvgpr_write_b32 a5, v17
+; GCN-NEXT: v_accvgpr_write_b32 a6, v18
+; GCN-NEXT: v_accvgpr_write_b32 a7, v19
+; GCN-NEXT: v_accvgpr_write_b32 a8, v20
+; GCN-NEXT: v_accvgpr_write_b32 a9, v21
+; GCN-NEXT: v_accvgpr_write_b32 a10, v22
+; GCN-NEXT: v_accvgpr_write_b32 a11, v23
+; GCN-NEXT: v_accvgpr_write_b32 a12, v24
+; GCN-NEXT: v_accvgpr_write_b32 a13, v25
+; GCN-NEXT: v_accvgpr_write_b32 a14, v26
+; GCN-NEXT: v_accvgpr_write_b32 a15, v27
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], 0, 0 op_sel_hi:[0,0,0] blgp:4
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
+ i32 0, ; cbsz
+ i32 4, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <16 x float> %result
+}
+
+; bf8 x fp8
+; Checks the 32x32x64 scaled MFMA with cbsz=1, blgp=0 and runtime scales.
+; The checks expect a15 and both scale values to be fetched with
+; scratch_load_dword (off s32) before the MFMA; the SDAG and GISEL check blocks
+; differ only in the order and v31/v32 assignment of the two scale loads.
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:1
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:1
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 1, ; cbsz
+ i32 0, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+; Checks the 32x32x64 scaled MFMA with cbsz=1, blgp=0 when both scale arguments
+; are the constant 0: the checks expect literal 0, 0 scale operands, with only
+; a15 fetched by scratch_load_dword (off s32) ahead of the MFMA.
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_dword a15, off, s32
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: v_accvgpr_write_b32 a4, v20
+; GCN-NEXT: v_accvgpr_write_b32 a5, v21
+; GCN-NEXT: v_accvgpr_write_b32 a6, v22
+; GCN-NEXT: v_accvgpr_write_b32 a7, v23
+; GCN-NEXT: v_accvgpr_write_b32 a8, v24
+; GCN-NEXT: v_accvgpr_write_b32 a9, v25
+; GCN-NEXT: v_accvgpr_write_b32 a10, v26
+; GCN-NEXT: v_accvgpr_write_b32 a11, v27
+; GCN-NEXT: v_accvgpr_write_b32 a12, v28
+; GCN-NEXT: v_accvgpr_write_b32 a13, v29
+; GCN-NEXT: v_accvgpr_write_b32 a14, v30
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 0, 0 op_sel_hi:[0,0,0] cbsz:1
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 1, ; cbsz
+ i32 0, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <16 x float> %result
+}
+
+; bf8 x bf8
+; Checks the 32x32x64 scaled MFMA with cbsz=1, blgp=1 and runtime scales.
+; The checks expect a15 and both scale values to be fetched with
+; scratch_load_dword (off s32) before the MFMA; the SDAG and GISEL check blocks
+; differ only in the order and v31/v32 assignment of the two scale loads.
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:1
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:1 blgp:1
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 1, ; cbsz
+ i32 1, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+
+; Checks the 32x32x64 scaled MFMA with cbsz=1, blgp=1 when both scale arguments
+; are the constant 0: the checks expect literal 0, 0 scale operands, with only
+; a15 fetched by scratch_load_dword (off s32) ahead of the MFMA.
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_dword a15, off, s32
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: v_accvgpr_write_b32 a4, v20
+; GCN-NEXT: v_accvgpr_write_b32 a5, v21
+; GCN-NEXT: v_accvgpr_write_b32 a6, v22
+; GCN-NEXT: v_accvgpr_write_b32 a7, v23
+; GCN-NEXT: v_accvgpr_write_b32 a8, v24
+; GCN-NEXT: v_accvgpr_write_b32 a9, v25
+; GCN-NEXT: v_accvgpr_write_b32 a10, v26
+; GCN-NEXT: v_accvgpr_write_b32 a11, v27
+; GCN-NEXT: v_accvgpr_write_b32 a12, v28
+; GCN-NEXT: v_accvgpr_write_b32 a13, v29
+; GCN-NEXT: v_accvgpr_write_b32 a14, v30
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 0, 0 op_sel_hi:[0,0,0] cbsz:1 blgp:1
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 1, ; cbsz
+ i32 1, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <16 x float> %result
+}
+
+; bf8 x fp6
+; Checks the 32x32x64 scaled MFMA with cbsz=1, blgp=2 (%arg1 is <6 x i32>) and
+; runtime scales: the scale operands are expected in v30 and v31, with v31
+; fetched by scratch_load_dword (off s32) ahead of the MFMA.
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_dword v31, off, s32
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: v_accvgpr_write_b32 a4, v18
+; GCN-NEXT: v_accvgpr_write_b32 a5, v19
+; GCN-NEXT: v_accvgpr_write_b32 a6, v20
+; GCN-NEXT: v_accvgpr_write_b32 a7, v21
+; GCN-NEXT: v_accvgpr_write_b32 a8, v22
+; GCN-NEXT: v_accvgpr_write_b32 a9, v23
+; GCN-NEXT: v_accvgpr_write_b32 a10, v24
+; GCN-NEXT: v_accvgpr_write_b32 a11, v25
+; GCN-NEXT: v_accvgpr_write_b32 a12, v26
+; GCN-NEXT: v_accvgpr_write_b32 a13, v27
+; GCN-NEXT: v_accvgpr_write_b32 a14, v28
+; GCN-NEXT: v_accvgpr_write_b32 a15, v29
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:2
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+ i32 1, ; cbsz
+ i32 2, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+; Checks the 32x32x64 scaled MFMA with cbsz=1, blgp=2 when both scale arguments
+; are the constant 0: the checks expect literal 0, 0 scale operands.
+; NOTE(review): this name ends in __constant_scale_0 while the sibling tests
+; use __constant_scale_0_0; looks like an oversight - confirm before renaming.
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: v_accvgpr_write_b32 a4, v18
+; GCN-NEXT: v_accvgpr_write_b32 a5, v19
+; GCN-NEXT: v_accvgpr_write_b32 a6, v20
+; GCN-NEXT: v_accvgpr_write_b32 a7, v21
+; GCN-NEXT: v_accvgpr_write_b32 a8, v22
+; GCN-NEXT: v_accvgpr_write_b32 a9, v23
+; GCN-NEXT: v_accvgpr_write_b32 a10, v24
+; GCN-NEXT: v_accvgpr_write_b32 a11, v25
+; GCN-NEXT: v_accvgpr_write_b32 a12, v26
+; GCN-NEXT: v_accvgpr_write_b32 a13, v27
+; GCN-NEXT: v_accvgpr_write_b32 a14, v28
+; GCN-NEXT: v_accvgpr_write_b32 a15, v29
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], 0, 0 op_sel_hi:[0,0,0] cbsz:1 blgp:2
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+ i32 1, ; cbsz
+ i32 2, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <16 x float> %result
+}
+
+; bf8 x bf6
+; Checks the 32x32x64 scaled MFMA with cbsz=1, blgp=3 (%arg1 is <6 x i32>) and
+; runtime scales: the scale operands are expected in v30 and v31, with v31
+; fetched by scratch_load_dword (off s32) ahead of the MFMA.
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_dword v31, off, s32
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: v_accvgpr_write_b32 a4, v18
+; GCN-NEXT: v_accvgpr_write_b32 a5, v19
+; GCN-NEXT: v_accvgpr_write_b32 a6, v20
+; GCN-NEXT: v_accvgpr_write_b32 a7, v21
+; GCN-NEXT: v_accvgpr_write_b32 a8, v22
+; GCN-NEXT: v_accvgpr_write_b32 a9, v23
+; GCN-NEXT: v_accvgpr_write_b32 a10, v24
+; GCN-NEXT: v_accvgpr_write_b32 a11, v25
+; GCN-NEXT: v_accvgpr_write_b32 a12, v26
+; GCN-NEXT: v_accvgpr_write_b32 a13, v27
+; GCN-NEXT: v_accvgpr_write_b32 a14, v28
+; GCN-NEXT: v_accvgpr_write_b32 a15, v29
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:3
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+ i32 1, ; cbsz
+ i32 3, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+; Checks the 32x32x64 scaled MFMA with cbsz=1, blgp=3 when both scale arguments
+; are the constant 0: the checks expect literal 0, 0 scale operands on the
+; instruction rather than VGPRs.
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: v_accvgpr_write_b32 a4, v18
+; GCN-NEXT: v_accvgpr_write_b32 a5, v19
+; GCN-NEXT: v_accvgpr_write_b32 a6, v20
+; GCN-NEXT: v_accvgpr_write_b32 a7, v21
+; GCN-NEXT: v_accvgpr_write_b32 a8, v22
+; GCN-NEXT: v_accvgpr_write_b32 a9, v23
+; GCN-NEXT: v_accvgpr_write_b32 a10, v24
+; GCN-NEXT: v_accvgpr_write_b32 a11, v25
+; GCN-NEXT: v_accvgpr_write_b32 a12, v26
+; GCN-NEXT: v_accvgpr_write_b32 a13, v27
+; GCN-NEXT: v_accvgpr_write_b32 a14, v28
+; GCN-NEXT: v_accvgpr_write_b32 a15, v29
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], 0, 0 op_sel_hi:[0,0,0] cbsz:1 blgp:3
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+ i32 1, ; cbsz
+ i32 3, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <16 x float> %result
+}
+
+; bf8 x fp4
+; Checks the 32x32x64 scaled MFMA with cbsz=1, blgp=4 and runtime scales:
+; %scale0/%scale1 are expected as the VGPR scale operands (v28, v29), and the
+; accumulator is staged through a[0:15] before being read back to v0-v15.
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: v_accvgpr_write_b32 a4, v16
+; GCN-NEXT: v_accvgpr_write_b32 a5, v17
+; GCN-NEXT: v_accvgpr_write_b32 a6, v18
+; GCN-NEXT: v_accvgpr_write_b32 a7, v19
+; GCN-NEXT: v_accvgpr_write_b32 a8, v20
+; GCN-NEXT: v_accvgpr_write_b32 a9, v21
+; GCN-NEXT: v_accvgpr_write_b32 a10, v22
+; GCN-NEXT: v_accvgpr_write_b32 a11, v23
+; GCN-NEXT: v_accvgpr_write_b32 a12, v24
+; GCN-NEXT: v_accvgpr_write_b32 a13, v25
+; GCN-NEXT: v_accvgpr_write_b32 a14, v26
+; GCN-NEXT: v_accvgpr_write_b32 a15, v27
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:1 blgp:4
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
+ i32 1, ; cbsz
+ i32 4, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+; Checks the 32x32x64 scaled MFMA with cbsz=1, blgp=4 when both scale arguments
+; are the constant 0: the checks expect literal 0, 0 scale operands on the
+; instruction rather than VGPRs.
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: v_accvgpr_write_b32 a4, v16
+; GCN-NEXT: v_accvgpr_write_b32 a5, v17
+; GCN-NEXT: v_accvgpr_write_b32 a6, v18
+; GCN-NEXT: v_accvgpr_write_b32 a7, v19
+; GCN-NEXT: v_accvgpr_write_b32 a8, v20
+; GCN-NEXT: v_accvgpr_write_b32 a9, v21
+; GCN-NEXT: v_accvgpr_write_b32 a10, v22
+; GCN-NEXT: v_accvgpr_write_b32 a11, v23
+; GCN-NEXT: v_accvgpr_write_b32 a12, v24
+; GCN-NEXT: v_accvgpr_write_b32 a13, v25
+; GCN-NEXT: v_accvgpr_write_b32 a14, v26
+; GCN-NEXT: v_accvgpr_write_b32 a15, v27
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], 0, 0 op_sel_hi:[0,0,0] cbsz:1 blgp:4
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
+ i32 1, ; cbsz
+ i32 4, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <16 x float> %result
+}
+
+; fp6 x fp8
+; Checks the 32x32x64 scaled MFMA with cbsz=2, blgp=0 (%arg0 is <6 x i32>) and
+; runtime scales: the scale operands are expected in v30 and v31, with v31
+; fetched by scratch_load_dword (off s32) ahead of the MFMA.
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_dword v31, off, s32
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: v_accvgpr_write_b32 a4, v18
+; GCN-NEXT: v_accvgpr_write_b32 a5, v19
+; GCN-NEXT: v_accvgpr_write_b32 a6, v20
+; GCN-NEXT: v_accvgpr_write_b32 a7, v21
+; GCN-NEXT: v_accvgpr_write_b32 a8, v22
+; GCN-NEXT: v_accvgpr_write_b32 a9, v23
+; GCN-NEXT: v_accvgpr_write_b32 a10, v24
+; GCN-NEXT: v_accvgpr_write_b32 a11, v25
+; GCN-NEXT: v_accvgpr_write_b32 a12, v26
+; GCN-NEXT: v_accvgpr_write_b32 a13, v27
+; GCN-NEXT: v_accvgpr_write_b32 a14, v28
+; GCN-NEXT: v_accvgpr_write_b32 a15, v29
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:2
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 2, ; cbsz
+ i32 0, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+; Checks the 32x32x64 scaled MFMA with cbsz=2, blgp=0 when both scale arguments
+; are the constant 0: the checks expect literal 0, 0 scale operands on the
+; instruction rather than VGPRs.
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: v_accvgpr_write_b32 a4, v18
+; GCN-NEXT: v_accvgpr_write_b32 a5, v19
+; GCN-NEXT: v_accvgpr_write_b32 a6, v20
+; GCN-NEXT: v_accvgpr_write_b32 a7, v21
+; GCN-NEXT: v_accvgpr_write_b32 a8, v22
+; GCN-NEXT: v_accvgpr_write_b32 a9, v23
+; GCN-NEXT: v_accvgpr_write_b32 a10, v24
+; GCN-NEXT: v_accvgpr_write_b32 a11, v25
+; GCN-NEXT: v_accvgpr_write_b32 a12, v26
+; GCN-NEXT: v_accvgpr_write_b32 a13, v27
+; GCN-NEXT: v_accvgpr_write_b32 a14, v28
+; GCN-NEXT: v_accvgpr_write_b32 a15, v29
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], 0, 0 op_sel_hi:[0,0,0] cbsz:2
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 2, ; cbsz
+ i32 0, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <16 x float> %result
+}
+
+; fp6 x bf8
+; Checks the 32x32x64 scaled MFMA with cbsz=2, blgp=1 (%arg0 is <6 x i32>) and
+; runtime scales: the scale operands are expected in v30 and v31, with v31
+; fetched by scratch_load_dword (off s32) ahead of the MFMA.
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_dword v31, off, s32
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: v_accvgpr_write_b32 a4, v18
+; GCN-NEXT: v_accvgpr_write_b32 a5, v19
+; GCN-NEXT: v_accvgpr_write_b32 a6, v20
+; GCN-NEXT: v_accvgpr_write_b32 a7, v21
+; GCN-NEXT: v_accvgpr_write_b32 a8, v22
+; GCN-NEXT: v_accvgpr_write_b32 a9, v23
+; GCN-NEXT: v_accvgpr_write_b32 a10, v24
+; GCN-NEXT: v_accvgpr_write_b32 a11, v25
+; GCN-NEXT: v_accvgpr_write_b32 a12, v26
+; GCN-NEXT: v_accvgpr_write_b32 a13, v27
+; GCN-NEXT: v_accvgpr_write_b32 a14, v28
+; GCN-NEXT: v_accvgpr_write_b32 a15, v29
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:2 blgp:1
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 2, ; cbsz
+ i32 1, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: v_accvgpr_write_b32 a4, v18
+; GCN-NEXT: v_accvgpr_write_b32 a5, v19
+; GCN-NEXT: v_accvgpr_write_b32 a6, v20
+; GCN-NEXT: v_accvgpr_write_b32 a7, v21
+; GCN-NEXT: v_accvgpr_write_b32 a8, v22
+; GCN-NEXT: v_accvgpr_write_b32 a9, v23
+; GCN-NEXT: v_accvgpr_write_b32 a10, v24
+; GCN-NEXT: v_accvgpr_write_b32 a11, v25
+; GCN-NEXT: v_accvgpr_write_b32 a12, v26
+; GCN-NEXT: v_accvgpr_write_b32 a13, v27
+; GCN-NEXT: v_accvgpr_write_b32 a14, v28
+; GCN-NEXT: v_accvgpr_write_b32 a15, v29
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], 0, 0 op_sel_hi:[0,0,0] cbsz:2 blgp:1
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 2, ; cbsz
+ i32 1, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <16 x float> %result
+}
+
+; fp6 x fp6
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: v_accvgpr_write_b32 a4, v16
+; GCN-NEXT: v_accvgpr_write_b32 a5, v17
+; GCN-NEXT: v_accvgpr_write_b32 a6, v18
+; GCN-NEXT: v_accvgpr_write_b32 a7, v19
+; GCN-NEXT: v_accvgpr_write_b32 a8, v20
+; GCN-NEXT: v_accvgpr_write_b32 a9, v21
+; GCN-NEXT: v_accvgpr_write_b32 a10, v22
+; GCN-NEXT: v_accvgpr_write_b32 a11, v23
+; GCN-NEXT: v_accvgpr_write_b32 a12, v24
+; GCN-NEXT: v_accvgpr_write_b32 a13, v25
+; GCN-NEXT: v_accvgpr_write_b32 a14, v26
+; GCN-NEXT: v_accvgpr_write_b32 a15, v27
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:2
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+ i32 2, ; cbsz
+ i32 2, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: v_accvgpr_write_b32 a4, v16
+; GCN-NEXT: v_accvgpr_write_b32 a5, v17
+; GCN-NEXT: v_accvgpr_write_b32 a6, v18
+; GCN-NEXT: v_accvgpr_write_b32 a7, v19
+; GCN-NEXT: v_accvgpr_write_b32 a8, v20
+; GCN-NEXT: v_accvgpr_write_b32 a9, v21
+; GCN-NEXT: v_accvgpr_write_b32 a10, v22
+; GCN-NEXT: v_accvgpr_write_b32 a11, v23
+; GCN-NEXT: v_accvgpr_write_b32 a12, v24
+; GCN-NEXT: v_accvgpr_write_b32 a13, v25
+; GCN-NEXT: v_accvgpr_write_b32 a14, v26
+; GCN-NEXT: v_accvgpr_write_b32 a15, v27
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], 0, 0 op_sel_hi:[0,0,0] cbsz:2 blgp:2
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+ i32 2, ; cbsz
+ i32 2, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <16 x float> %result
+}
+
+; fp6 x bf6
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: v_accvgpr_write_b32 a4, v16
+; GCN-NEXT: v_accvgpr_write_b32 a5, v17
+; GCN-NEXT: v_accvgpr_write_b32 a6, v18
+; GCN-NEXT: v_accvgpr_write_b32 a7, v19
+; GCN-NEXT: v_accvgpr_write_b32 a8, v20
+; GCN-NEXT: v_accvgpr_write_b32 a9, v21
+; GCN-NEXT: v_accvgpr_write_b32 a10, v22
+; GCN-NEXT: v_accvgpr_write_b32 a11, v23
+; GCN-NEXT: v_accvgpr_write_b32 a12, v24
+; GCN-NEXT: v_accvgpr_write_b32 a13, v25
+; GCN-NEXT: v_accvgpr_write_b32 a14, v26
+; GCN-NEXT: v_accvgpr_write_b32 a15, v27
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:3
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+ i32 2, ; cbsz
+ i32 3, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: v_accvgpr_write_b32 a4, v16
+; GCN-NEXT: v_accvgpr_write_b32 a5, v17
+; GCN-NEXT: v_accvgpr_write_b32 a6, v18
+; GCN-NEXT: v_accvgpr_write_b32 a7, v19
+; GCN-NEXT: v_accvgpr_write_b32 a8, v20
+; GCN-NEXT: v_accvgpr_write_b32 a9, v21
+; GCN-NEXT: v_accvgpr_write_b32 a10, v22
+; GCN-NEXT: v_accvgpr_write_b32 a11, v23
+; GCN-NEXT: v_accvgpr_write_b32 a12, v24
+; GCN-NEXT: v_accvgpr_write_b32 a13, v25
+; GCN-NEXT: v_accvgpr_write_b32 a14, v26
+; GCN-NEXT: v_accvgpr_write_b32 a15, v27
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], 0, 0 op_sel_hi:[0,0,0] cbsz:2 blgp:3
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+ i32 2, ; cbsz
+ i32 3, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <16 x float> %result
+}
+
+
+; bf6 x fp8
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_dword v31, off, s32
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: v_accvgpr_write_b32 a4, v18
+; GCN-NEXT: v_accvgpr_write_b32 a5, v19
+; GCN-NEXT: v_accvgpr_write_b32 a6, v20
+; GCN-NEXT: v_accvgpr_write_b32 a7, v21
+; GCN-NEXT: v_accvgpr_write_b32 a8, v22
+; GCN-NEXT: v_accvgpr_write_b32 a9, v23
+; GCN-NEXT: v_accvgpr_write_b32 a10, v24
+; GCN-NEXT: v_accvgpr_write_b32 a11, v25
+; GCN-NEXT: v_accvgpr_write_b32 a12, v26
+; GCN-NEXT: v_accvgpr_write_b32 a13, v27
+; GCN-NEXT: v_accvgpr_write_b32 a14, v28
+; GCN-NEXT: v_accvgpr_write_b32 a15, v29
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:3
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 3, ; cbsz
+ i32 0, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: v_accvgpr_write_b32 a4, v18
+; GCN-NEXT: v_accvgpr_write_b32 a5, v19
+; GCN-NEXT: v_accvgpr_write_b32 a6, v20
+; GCN-NEXT: v_accvgpr_write_b32 a7, v21
+; GCN-NEXT: v_accvgpr_write_b32 a8, v22
+; GCN-NEXT: v_accvgpr_write_b32 a9, v23
+; GCN-NEXT: v_accvgpr_write_b32 a10, v24
+; GCN-NEXT: v_accvgpr_write_b32 a11, v25
+; GCN-NEXT: v_accvgpr_write_b32 a12, v26
+; GCN-NEXT: v_accvgpr_write_b32 a13, v27
+; GCN-NEXT: v_accvgpr_write_b32 a14, v28
+; GCN-NEXT: v_accvgpr_write_b32 a15, v29
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], 0, 0 op_sel_hi:[0,0,0] cbsz:3
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 3, ; cbsz
+ i32 0, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <16 x float> %result
+}
+
+; bf6 x bf8
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_dword v31, off, s32
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: v_accvgpr_write_b32 a4, v18
+; GCN-NEXT: v_accvgpr_write_b32 a5, v19
+; GCN-NEXT: v_accvgpr_write_b32 a6, v20
+; GCN-NEXT: v_accvgpr_write_b32 a7, v21
+; GCN-NEXT: v_accvgpr_write_b32 a8, v22
+; GCN-NEXT: v_accvgpr_write_b32 a9, v23
+; GCN-NEXT: v_accvgpr_write_b32 a10, v24
+; GCN-NEXT: v_accvgpr_write_b32 a11, v25
+; GCN-NEXT: v_accvgpr_write_b32 a12, v26
+; GCN-NEXT: v_accvgpr_write_b32 a13, v27
+; GCN-NEXT: v_accvgpr_write_b32 a14, v28
+; GCN-NEXT: v_accvgpr_write_b32 a15, v29
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:3 blgp:1
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 3, ; cbsz
+ i32 1, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: v_accvgpr_write_b32 a4, v18
+; GCN-NEXT: v_accvgpr_write_b32 a5, v19
+; GCN-NEXT: v_accvgpr_write_b32 a6, v20
+; GCN-NEXT: v_accvgpr_write_b32 a7, v21
+; GCN-NEXT: v_accvgpr_write_b32 a8, v22
+; GCN-NEXT: v_accvgpr_write_b32 a9, v23
+; GCN-NEXT: v_accvgpr_write_b32 a10, v24
+; GCN-NEXT: v_accvgpr_write_b32 a11, v25
+; GCN-NEXT: v_accvgpr_write_b32 a12, v26
+; GCN-NEXT: v_accvgpr_write_b32 a13, v27
+; GCN-NEXT: v_accvgpr_write_b32 a14, v28
+; GCN-NEXT: v_accvgpr_write_b32 a15, v29
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], 0, 0 op_sel_hi:[0,0,0] cbsz:3 blgp:1
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 3, ; cbsz
+ i32 1, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <16 x float> %result
+}
+
+; bf6 x fp6
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: v_accvgpr_write_b32 a4, v16
+; GCN-NEXT: v_accvgpr_write_b32 a5, v17
+; GCN-NEXT: v_accvgpr_write_b32 a6, v18
+; GCN-NEXT: v_accvgpr_write_b32 a7, v19
+; GCN-NEXT: v_accvgpr_write_b32 a8, v20
+; GCN-NEXT: v_accvgpr_write_b32 a9, v21
+; GCN-NEXT: v_accvgpr_write_b32 a10, v22
+; GCN-NEXT: v_accvgpr_write_b32 a11, v23
+; GCN-NEXT: v_accvgpr_write_b32 a12, v24
+; GCN-NEXT: v_accvgpr_write_b32 a13, v25
+; GCN-NEXT: v_accvgpr_write_b32 a14, v26
+; GCN-NEXT: v_accvgpr_write_b32 a15, v27
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:2
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+ i32 3, ; cbsz
+ i32 2, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: v_accvgpr_write_b32 a4, v16
+; GCN-NEXT: v_accvgpr_write_b32 a5, v17
+; GCN-NEXT: v_accvgpr_write_b32 a6, v18
+; GCN-NEXT: v_accvgpr_write_b32 a7, v19
+; GCN-NEXT: v_accvgpr_write_b32 a8, v20
+; GCN-NEXT: v_accvgpr_write_b32 a9, v21
+; GCN-NEXT: v_accvgpr_write_b32 a10, v22
+; GCN-NEXT: v_accvgpr_write_b32 a11, v23
+; GCN-NEXT: v_accvgpr_write_b32 a12, v24
+; GCN-NEXT: v_accvgpr_write_b32 a13, v25
+; GCN-NEXT: v_accvgpr_write_b32 a14, v26
+; GCN-NEXT: v_accvgpr_write_b32 a15, v27
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], 0, 0 op_sel_hi:[0,0,0] cbsz:3 blgp:2
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+ i32 3, ; cbsz
+ i32 2, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <16 x float> %result
+}
+
+; bf6 x fp4
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v10
+; GCN-NEXT: v_accvgpr_write_b32 a1, v11
+; GCN-NEXT: v_accvgpr_write_b32 a2, v12
+; GCN-NEXT: v_accvgpr_write_b32 a3, v13
+; GCN-NEXT: v_accvgpr_write_b32 a4, v14
+; GCN-NEXT: v_accvgpr_write_b32 a5, v15
+; GCN-NEXT: v_accvgpr_write_b32 a6, v16
+; GCN-NEXT: v_accvgpr_write_b32 a7, v17
+; GCN-NEXT: v_accvgpr_write_b32 a8, v18
+; GCN-NEXT: v_accvgpr_write_b32 a9, v19
+; GCN-NEXT: v_accvgpr_write_b32 a10, v20
+; GCN-NEXT: v_accvgpr_write_b32 a11, v21
+; GCN-NEXT: v_accvgpr_write_b32 a12, v22
+; GCN-NEXT: v_accvgpr_write_b32 a13, v23
+; GCN-NEXT: v_accvgpr_write_b32 a14, v24
+; GCN-NEXT: v_accvgpr_write_b32 a15, v25
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:3 blgp:4
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
+ i32 3, ; cbsz
+ i32 4, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v10
+; GCN-NEXT: v_accvgpr_write_b32 a1, v11
+; GCN-NEXT: v_accvgpr_write_b32 a2, v12
+; GCN-NEXT: v_accvgpr_write_b32 a3, v13
+; GCN-NEXT: v_accvgpr_write_b32 a4, v14
+; GCN-NEXT: v_accvgpr_write_b32 a5, v15
+; GCN-NEXT: v_accvgpr_write_b32 a6, v16
+; GCN-NEXT: v_accvgpr_write_b32 a7, v17
+; GCN-NEXT: v_accvgpr_write_b32 a8, v18
+; GCN-NEXT: v_accvgpr_write_b32 a9, v19
+; GCN-NEXT: v_accvgpr_write_b32 a10, v20
+; GCN-NEXT: v_accvgpr_write_b32 a11, v21
+; GCN-NEXT: v_accvgpr_write_b32 a12, v22
+; GCN-NEXT: v_accvgpr_write_b32 a13, v23
+; GCN-NEXT: v_accvgpr_write_b32 a14, v24
+; GCN-NEXT: v_accvgpr_write_b32 a15, v25
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], 0, 0 op_sel_hi:[0,0,0] cbsz:3 blgp:4
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
+ i32 3, ; cbsz
+ i32 4, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <16 x float> %result
+}
+
+; bf6 x bf6
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: v_accvgpr_write_b32 a4, v16
+; GCN-NEXT: v_accvgpr_write_b32 a5, v17
+; GCN-NEXT: v_accvgpr_write_b32 a6, v18
+; GCN-NEXT: v_accvgpr_write_b32 a7, v19
+; GCN-NEXT: v_accvgpr_write_b32 a8, v20
+; GCN-NEXT: v_accvgpr_write_b32 a9, v21
+; GCN-NEXT: v_accvgpr_write_b32 a10, v22
+; GCN-NEXT: v_accvgpr_write_b32 a11, v23
+; GCN-NEXT: v_accvgpr_write_b32 a12, v24
+; GCN-NEXT: v_accvgpr_write_b32 a13, v25
+; GCN-NEXT: v_accvgpr_write_b32 a14, v26
+; GCN-NEXT: v_accvgpr_write_b32 a15, v27
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:3
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+ i32 3, ; cbsz
+ i32 3, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: v_accvgpr_write_b32 a4, v16
+; GCN-NEXT: v_accvgpr_write_b32 a5, v17
+; GCN-NEXT: v_accvgpr_write_b32 a6, v18
+; GCN-NEXT: v_accvgpr_write_b32 a7, v19
+; GCN-NEXT: v_accvgpr_write_b32 a8, v20
+; GCN-NEXT: v_accvgpr_write_b32 a9, v21
+; GCN-NEXT: v_accvgpr_write_b32 a10, v22
+; GCN-NEXT: v_accvgpr_write_b32 a11, v23
+; GCN-NEXT: v_accvgpr_write_b32 a12, v24
+; GCN-NEXT: v_accvgpr_write_b32 a13, v25
+; GCN-NEXT: v_accvgpr_write_b32 a14, v26
+; GCN-NEXT: v_accvgpr_write_b32 a15, v27
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], 0, 0 op_sel_hi:[0,0,0] cbsz:3 blgp:3
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+ i32 3, ; cbsz
+ i32 3, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <16 x float> %result
+}
+
+; fp6 x fp4
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v10
+; GCN-NEXT: v_accvgpr_write_b32 a1, v11
+; GCN-NEXT: v_accvgpr_write_b32 a2, v12
+; GCN-NEXT: v_accvgpr_write_b32 a3, v13
+; GCN-NEXT: v_accvgpr_write_b32 a4, v14
+; GCN-NEXT: v_accvgpr_write_b32 a5, v15
+; GCN-NEXT: v_accvgpr_write_b32 a6, v16
+; GCN-NEXT: v_accvgpr_write_b32 a7, v17
+; GCN-NEXT: v_accvgpr_write_b32 a8, v18
+; GCN-NEXT: v_accvgpr_write_b32 a9, v19
+; GCN-NEXT: v_accvgpr_write_b32 a10, v20
+; GCN-NEXT: v_accvgpr_write_b32 a11, v21
+; GCN-NEXT: v_accvgpr_write_b32 a12, v22
+; GCN-NEXT: v_accvgpr_write_b32 a13, v23
+; GCN-NEXT: v_accvgpr_write_b32 a14, v24
+; GCN-NEXT: v_accvgpr_write_b32 a15, v25
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:2 blgp:4
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
+ i32 2, ; cbsz
+ i32 4, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v10
+; GCN-NEXT: v_accvgpr_write_b32 a1, v11
+; GCN-NEXT: v_accvgpr_write_b32 a2, v12
+; GCN-NEXT: v_accvgpr_write_b32 a3, v13
+; GCN-NEXT: v_accvgpr_write_b32 a4, v14
+; GCN-NEXT: v_accvgpr_write_b32 a5, v15
+; GCN-NEXT: v_accvgpr_write_b32 a6, v16
+; GCN-NEXT: v_accvgpr_write_b32 a7, v17
+; GCN-NEXT: v_accvgpr_write_b32 a8, v18
+; GCN-NEXT: v_accvgpr_write_b32 a9, v19
+; GCN-NEXT: v_accvgpr_write_b32 a10, v20
+; GCN-NEXT: v_accvgpr_write_b32 a11, v21
+; GCN-NEXT: v_accvgpr_write_b32 a12, v22
+; GCN-NEXT: v_accvgpr_write_b32 a13, v23
+; GCN-NEXT: v_accvgpr_write_b32 a14, v24
+; GCN-NEXT: v_accvgpr_write_b32 a15, v25
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], 0, 0 op_sel_hi:[0,0,0] cbsz:2 blgp:4
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
+ i32 2, ; cbsz
+ i32 4, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <16 x float> %result
+}
+
+; fp4 x fp8
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: v_accvgpr_write_b32 a4, v16
+; GCN-NEXT: v_accvgpr_write_b32 a5, v17
+; GCN-NEXT: v_accvgpr_write_b32 a6, v18
+; GCN-NEXT: v_accvgpr_write_b32 a7, v19
+; GCN-NEXT: v_accvgpr_write_b32 a8, v20
+; GCN-NEXT: v_accvgpr_write_b32 a9, v21
+; GCN-NEXT: v_accvgpr_write_b32 a10, v22
+; GCN-NEXT: v_accvgpr_write_b32 a11, v23
+; GCN-NEXT: v_accvgpr_write_b32 a12, v24
+; GCN-NEXT: v_accvgpr_write_b32 a13, v25
+; GCN-NEXT: v_accvgpr_write_b32 a14, v26
+; GCN-NEXT: v_accvgpr_write_b32 a15, v27
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 4, ; cbsz
+ i32 0, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: v_accvgpr_write_b32 a4, v16
+; GCN-NEXT: v_accvgpr_write_b32 a5, v17
+; GCN-NEXT: v_accvgpr_write_b32 a6, v18
+; GCN-NEXT: v_accvgpr_write_b32 a7, v19
+; GCN-NEXT: v_accvgpr_write_b32 a8, v20
+; GCN-NEXT: v_accvgpr_write_b32 a9, v21
+; GCN-NEXT: v_accvgpr_write_b32 a10, v22
+; GCN-NEXT: v_accvgpr_write_b32 a11, v23
+; GCN-NEXT: v_accvgpr_write_b32 a12, v24
+; GCN-NEXT: v_accvgpr_write_b32 a13, v25
+; GCN-NEXT: v_accvgpr_write_b32 a14, v26
+; GCN-NEXT: v_accvgpr_write_b32 a15, v27
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], 0, 0 op_sel_hi:[0,0,0] cbsz:4
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 4, ; cbsz
+ i32 0, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <16 x float> %result
+}
+
+; fp4 x bf8
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: v_accvgpr_write_b32 a4, v16
+; GCN-NEXT: v_accvgpr_write_b32 a5, v17
+; GCN-NEXT: v_accvgpr_write_b32 a6, v18
+; GCN-NEXT: v_accvgpr_write_b32 a7, v19
+; GCN-NEXT: v_accvgpr_write_b32 a8, v20
+; GCN-NEXT: v_accvgpr_write_b32 a9, v21
+; GCN-NEXT: v_accvgpr_write_b32 a10, v22
+; GCN-NEXT: v_accvgpr_write_b32 a11, v23
+; GCN-NEXT: v_accvgpr_write_b32 a12, v24
+; GCN-NEXT: v_accvgpr_write_b32 a13, v25
+; GCN-NEXT: v_accvgpr_write_b32 a14, v26
+; GCN-NEXT: v_accvgpr_write_b32 a15, v27
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4 blgp:1
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 4, ; cbsz
+ i32 1, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: v_accvgpr_write_b32 a4, v16
+; GCN-NEXT: v_accvgpr_write_b32 a5, v17
+; GCN-NEXT: v_accvgpr_write_b32 a6, v18
+; GCN-NEXT: v_accvgpr_write_b32 a7, v19
+; GCN-NEXT: v_accvgpr_write_b32 a8, v20
+; GCN-NEXT: v_accvgpr_write_b32 a9, v21
+; GCN-NEXT: v_accvgpr_write_b32 a10, v22
+; GCN-NEXT: v_accvgpr_write_b32 a11, v23
+; GCN-NEXT: v_accvgpr_write_b32 a12, v24
+; GCN-NEXT: v_accvgpr_write_b32 a13, v25
+; GCN-NEXT: v_accvgpr_write_b32 a14, v26
+; GCN-NEXT: v_accvgpr_write_b32 a15, v27
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], 0, 0 op_sel_hi:[0,0,0] cbsz:4 blgp:1
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 4, ; cbsz
+ i32 1, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <16 x float> %result
+}
+
+; fp4 x fp6
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v10
+; GCN-NEXT: v_accvgpr_write_b32 a1, v11
+; GCN-NEXT: v_accvgpr_write_b32 a2, v12
+; GCN-NEXT: v_accvgpr_write_b32 a3, v13
+; GCN-NEXT: v_accvgpr_write_b32 a4, v14
+; GCN-NEXT: v_accvgpr_write_b32 a5, v15
+; GCN-NEXT: v_accvgpr_write_b32 a6, v16
+; GCN-NEXT: v_accvgpr_write_b32 a7, v17
+; GCN-NEXT: v_accvgpr_write_b32 a8, v18
+; GCN-NEXT: v_accvgpr_write_b32 a9, v19
+; GCN-NEXT: v_accvgpr_write_b32 a10, v20
+; GCN-NEXT: v_accvgpr_write_b32 a11, v21
+; GCN-NEXT: v_accvgpr_write_b32 a12, v22
+; GCN-NEXT: v_accvgpr_write_b32 a13, v23
+; GCN-NEXT: v_accvgpr_write_b32 a14, v24
+; GCN-NEXT: v_accvgpr_write_b32 a15, v25
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:2
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+ i32 4, ; cbsz
+ i32 2, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v10
+; GCN-NEXT: v_accvgpr_write_b32 a1, v11
+; GCN-NEXT: v_accvgpr_write_b32 a2, v12
+; GCN-NEXT: v_accvgpr_write_b32 a3, v13
+; GCN-NEXT: v_accvgpr_write_b32 a4, v14
+; GCN-NEXT: v_accvgpr_write_b32 a5, v15
+; GCN-NEXT: v_accvgpr_write_b32 a6, v16
+; GCN-NEXT: v_accvgpr_write_b32 a7, v17
+; GCN-NEXT: v_accvgpr_write_b32 a8, v18
+; GCN-NEXT: v_accvgpr_write_b32 a9, v19
+; GCN-NEXT: v_accvgpr_write_b32 a10, v20
+; GCN-NEXT: v_accvgpr_write_b32 a11, v21
+; GCN-NEXT: v_accvgpr_write_b32 a12, v22
+; GCN-NEXT: v_accvgpr_write_b32 a13, v23
+; GCN-NEXT: v_accvgpr_write_b32 a14, v24
+; GCN-NEXT: v_accvgpr_write_b32 a15, v25
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], 0, 0 op_sel_hi:[0,0,0] cbsz:4 blgp:2
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+ i32 4, ; cbsz
+ i32 2, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <16 x float> %result
+}
+
+; fp4 x bf6
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v10
+; GCN-NEXT: v_accvgpr_write_b32 a1, v11
+; GCN-NEXT: v_accvgpr_write_b32 a2, v12
+; GCN-NEXT: v_accvgpr_write_b32 a3, v13
+; GCN-NEXT: v_accvgpr_write_b32 a4, v14
+; GCN-NEXT: v_accvgpr_write_b32 a5, v15
+; GCN-NEXT: v_accvgpr_write_b32 a6, v16
+; GCN-NEXT: v_accvgpr_write_b32 a7, v17
+; GCN-NEXT: v_accvgpr_write_b32 a8, v18
+; GCN-NEXT: v_accvgpr_write_b32 a9, v19
+; GCN-NEXT: v_accvgpr_write_b32 a10, v20
+; GCN-NEXT: v_accvgpr_write_b32 a11, v21
+; GCN-NEXT: v_accvgpr_write_b32 a12, v22
+; GCN-NEXT: v_accvgpr_write_b32 a13, v23
+; GCN-NEXT: v_accvgpr_write_b32 a14, v24
+; GCN-NEXT: v_accvgpr_write_b32 a15, v25
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:3
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+ i32 4, ; cbsz
+ i32 3, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v10
+; GCN-NEXT: v_accvgpr_write_b32 a1, v11
+; GCN-NEXT: v_accvgpr_write_b32 a2, v12
+; GCN-NEXT: v_accvgpr_write_b32 a3, v13
+; GCN-NEXT: v_accvgpr_write_b32 a4, v14
+; GCN-NEXT: v_accvgpr_write_b32 a5, v15
+; GCN-NEXT: v_accvgpr_write_b32 a6, v16
+; GCN-NEXT: v_accvgpr_write_b32 a7, v17
+; GCN-NEXT: v_accvgpr_write_b32 a8, v18
+; GCN-NEXT: v_accvgpr_write_b32 a9, v19
+; GCN-NEXT: v_accvgpr_write_b32 a10, v20
+; GCN-NEXT: v_accvgpr_write_b32 a11, v21
+; GCN-NEXT: v_accvgpr_write_b32 a12, v22
+; GCN-NEXT: v_accvgpr_write_b32 a13, v23
+; GCN-NEXT: v_accvgpr_write_b32 a14, v24
+; GCN-NEXT: v_accvgpr_write_b32 a15, v25
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], 0, 0 op_sel_hi:[0,0,0] cbsz:4 blgp:3
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+ i32 4, ; cbsz
+ i32 3, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <16 x float> %result
+}
+
+; fp4 x fp4
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v8
+; GCN-NEXT: v_accvgpr_write_b32 a1, v9
+; GCN-NEXT: v_accvgpr_write_b32 a2, v10
+; GCN-NEXT: v_accvgpr_write_b32 a3, v11
+; GCN-NEXT: v_accvgpr_write_b32 a4, v12
+; GCN-NEXT: v_accvgpr_write_b32 a5, v13
+; GCN-NEXT: v_accvgpr_write_b32 a6, v14
+; GCN-NEXT: v_accvgpr_write_b32 a7, v15
+; GCN-NEXT: v_accvgpr_write_b32 a8, v16
+; GCN-NEXT: v_accvgpr_write_b32 a9, v17
+; GCN-NEXT: v_accvgpr_write_b32 a10, v18
+; GCN-NEXT: v_accvgpr_write_b32 a11, v19
+; GCN-NEXT: v_accvgpr_write_b32 a12, v20
+; GCN-NEXT: v_accvgpr_write_b32 a13, v21
+; GCN-NEXT: v_accvgpr_write_b32 a14, v22
+; GCN-NEXT: v_accvgpr_write_b32 a15, v23
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:4
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
+ i32 4, ; cbsz
+ i32 4, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v8
+; GCN-NEXT: v_accvgpr_write_b32 a1, v9
+; GCN-NEXT: v_accvgpr_write_b32 a2, v10
+; GCN-NEXT: v_accvgpr_write_b32 a3, v11
+; GCN-NEXT: v_accvgpr_write_b32 a4, v12
+; GCN-NEXT: v_accvgpr_write_b32 a5, v13
+; GCN-NEXT: v_accvgpr_write_b32 a6, v14
+; GCN-NEXT: v_accvgpr_write_b32 a7, v15
+; GCN-NEXT: v_accvgpr_write_b32 a8, v16
+; GCN-NEXT: v_accvgpr_write_b32 a9, v17
+; GCN-NEXT: v_accvgpr_write_b32 a10, v18
+; GCN-NEXT: v_accvgpr_write_b32 a11, v19
+; GCN-NEXT: v_accvgpr_write_b32 a12, v20
+; GCN-NEXT: v_accvgpr_write_b32 a13, v21
+; GCN-NEXT: v_accvgpr_write_b32 a14, v22
+; GCN-NEXT: v_accvgpr_write_b32 a15, v23
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15], 0, 0 op_sel_hi:[0,0,0] cbsz:4 blgp:4
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
+ i32 4, ; cbsz
+ i32 4, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <16 x float> %result
+}
+
+; --------------------------------------------------------------------
+; Different input parameter classes
+; --------------------------------------------------------------------
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 inreg %scale0, i32 inreg %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_dword a15, off, s32
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: v_accvgpr_write_b32 a4, v20
+; GCN-NEXT: v_accvgpr_write_b32 a5, v21
+; GCN-NEXT: v_accvgpr_write_b32 a6, v22
+; GCN-NEXT: v_accvgpr_write_b32 a7, v23
+; GCN-NEXT: v_accvgpr_write_b32 a8, v24
+; GCN-NEXT: v_accvgpr_write_b32 a9, v25
+; GCN-NEXT: v_accvgpr_write_b32 a10, v26
+; GCN-NEXT: v_accvgpr_write_b32 a11, v27
+; GCN-NEXT: v_accvgpr_write_b32 a12, v28
+; GCN-NEXT: v_accvgpr_write_b32 a13, v29
+; GCN-NEXT: v_accvgpr_write_b32 a14, v30
+; GCN-NEXT: v_mov_b32_e32 v16, s1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel_hi:[0,0,0]
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 inreg %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_dword a15, off, s32
+; GCN-NEXT: scratch_load_dword v31, off, s32 offset:4
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: v_accvgpr_write_b32 a4, v20
+; GCN-NEXT: v_accvgpr_write_b32 a5, v21
+; GCN-NEXT: v_accvgpr_write_b32 a6, v22
+; GCN-NEXT: v_accvgpr_write_b32 a7, v23
+; GCN-NEXT: v_accvgpr_write_b32 a8, v24
+; GCN-NEXT: v_accvgpr_write_b32 a9, v25
+; GCN-NEXT: v_accvgpr_write_b32 a10, v26
+; GCN-NEXT: v_accvgpr_write_b32 a11, v27
+; GCN-NEXT: v_accvgpr_write_b32 a12, v28
+; GCN-NEXT: v_accvgpr_write_b32 a13, v29
+; GCN-NEXT: v_accvgpr_write_b32 a14, v30
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v31 op_sel_hi:[0,0,0]
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 inreg %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_dword a15, off, s32
+; GCN-NEXT: scratch_load_dword v31, off, s32 offset:4
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: v_accvgpr_write_b32 a4, v20
+; GCN-NEXT: v_accvgpr_write_b32 a5, v21
+; GCN-NEXT: v_accvgpr_write_b32 a6, v22
+; GCN-NEXT: v_accvgpr_write_b32 a7, v23
+; GCN-NEXT: v_accvgpr_write_b32 a8, v24
+; GCN-NEXT: v_accvgpr_write_b32 a9, v25
+; GCN-NEXT: v_accvgpr_write_b32 a10, v26
+; GCN-NEXT: v_accvgpr_write_b32 a11, v27
+; GCN-NEXT: v_accvgpr_write_b32 a12, v28
+; GCN-NEXT: v_accvgpr_write_b32 a13, v29
+; GCN-NEXT: v_accvgpr_write_b32 a14, v30
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, s0 op_sel_hi:[0,0,0]
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+; Scaled 32x32x64 MFMA where all three matrix operands (%arg0, %arg1, %arg2)
+; are marked inreg and the two scale operands are not.
+; The expected code for both check prefixes copies the SGPR inputs into VGPRs,
+; assembles the accumulator in a[0:15] from s28-s29 and v0-v13 (presumably the
+; inreg SGPR budget runs out part-way through %arg2 - confirm against the
+; calling convention), and issues the MFMA with its scales in v14 and v15.
+; The 16-element result is then read back from a[0:15] into v0-v15.
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v16, s28
+; SDAG-NEXT: v_mov_b32_e32 v31, v13
+; SDAG-NEXT: v_mov_b32_e32 v30, v12
+; SDAG-NEXT: v_mov_b32_e32 v29, v11
+; SDAG-NEXT: v_mov_b32_e32 v28, v10
+; SDAG-NEXT: v_mov_b32_e32 v27, v9
+; SDAG-NEXT: v_mov_b32_e32 v26, v8
+; SDAG-NEXT: v_mov_b32_e32 v25, v7
+; SDAG-NEXT: v_mov_b32_e32 v24, v6
+; SDAG-NEXT: v_mov_b32_e32 v23, v5
+; SDAG-NEXT: v_mov_b32_e32 v22, v4
+; SDAG-NEXT: v_mov_b32_e32 v21, v3
+; SDAG-NEXT: v_mov_b32_e32 v20, v2
+; SDAG-NEXT: v_mov_b32_e32 v19, v1
+; SDAG-NEXT: v_mov_b32_e32 v18, v0
+; SDAG-NEXT: v_mov_b32_e32 v17, s29
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_mov_b32_e32 v0, s20
+; SDAG-NEXT: v_mov_b32_e32 v1, s21
+; SDAG-NEXT: v_mov_b32_e32 v2, s22
+; SDAG-NEXT: v_mov_b32_e32 v3, s23
+; SDAG-NEXT: v_mov_b32_e32 v4, s24
+; SDAG-NEXT: v_mov_b32_e32 v5, s25
+; SDAG-NEXT: v_mov_b32_e32 v6, s26
+; SDAG-NEXT: v_mov_b32_e32 v7, s27
+; SDAG-NEXT: v_mov_b32_e32 v32, s0
+; SDAG-NEXT: v_mov_b32_e32 v33, s1
+; SDAG-NEXT: v_mov_b32_e32 v34, s2
+; SDAG-NEXT: v_mov_b32_e32 v35, s3
+; SDAG-NEXT: v_mov_b32_e32 v36, s16
+; SDAG-NEXT: v_mov_b32_e32 v37, s17
+; SDAG-NEXT: v_mov_b32_e32 v38, s18
+; SDAG-NEXT: v_mov_b32_e32 v39, s19
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v31
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[32:39], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s12, s0
+; GISEL-NEXT: s_mov_b32 s13, s1
+; GISEL-NEXT: v_mov_b32_e32 v16, s28
+; GISEL-NEXT: s_mov_b32 s14, s2
+; GISEL-NEXT: s_mov_b32 s15, s3
+; GISEL-NEXT: v_mov_b32_e32 v18, v0
+; GISEL-NEXT: v_mov_b32_e32 v19, v1
+; GISEL-NEXT: v_mov_b32_e32 v20, v2
+; GISEL-NEXT: v_mov_b32_e32 v21, v3
+; GISEL-NEXT: v_mov_b32_e32 v22, v4
+; GISEL-NEXT: v_mov_b32_e32 v23, v5
+; GISEL-NEXT: v_mov_b32_e32 v24, v6
+; GISEL-NEXT: v_mov_b32_e32 v25, v7
+; GISEL-NEXT: v_mov_b32_e32 v26, v8
+; GISEL-NEXT: v_mov_b32_e32 v27, v9
+; GISEL-NEXT: v_mov_b32_e32 v28, v10
+; GISEL-NEXT: v_mov_b32_e32 v29, v11
+; GISEL-NEXT: v_mov_b32_e32 v30, v12
+; GISEL-NEXT: v_mov_b32_e32 v31, v13
+; GISEL-NEXT: v_mov_b32_e32 v17, s29
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[26:27]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[20:21]
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v31
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[32:39], a[0:15], v14, v15 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+; Scaled 32x32x64 MFMA with %arg0 and %scale0 marked inreg; %arg1, %arg2 and
+; %scale1 are ordinary arguments.
+; Expected code for both check prefixes: %arg0 is copied from s0-s3/s16-s19
+; into v[26:33], the accumulator is written from v8-v23 into a[0:15], and the
+; MFMA takes the first scale directly from s20 and the second from v24.
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 inreg %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT: v_mov_b32_e32 v26, s0
+; SDAG-NEXT: v_mov_b32_e32 v27, s1
+; SDAG-NEXT: v_mov_b32_e32 v28, s2
+; SDAG-NEXT: v_mov_b32_e32 v29, s3
+; SDAG-NEXT: v_mov_b32_e32 v30, s16
+; SDAG-NEXT: v_mov_b32_e32 v31, s17
+; SDAG-NEXT: v_mov_b32_e32 v32, s18
+; SDAG-NEXT: v_mov_b32_e32 v33, s19
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v23
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], s20, v24 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s12, s0
+; GISEL-NEXT: s_mov_b32 s13, s1
+; GISEL-NEXT: s_mov_b32 s14, s2
+; GISEL-NEXT: s_mov_b32 s15, s3
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[18:19]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[12:13]
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v23
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], s20, v24 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+; Scaled 32x32x64 MFMA with %arg0 and %scale1 marked inreg; %arg1, %arg2 and
+; %scale0 are ordinary arguments.
+; Expected code for both check prefixes: %arg0 is copied from s0-s3/s16-s19
+; into v[26:33], the accumulator is written from v8-v23 into a[0:15], and the
+; MFMA takes the first scale from v24 and the second directly from s20.
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 inreg %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT: v_mov_b32_e32 v26, s0
+; SDAG-NEXT: v_mov_b32_e32 v27, s1
+; SDAG-NEXT: v_mov_b32_e32 v28, s2
+; SDAG-NEXT: v_mov_b32_e32 v29, s3
+; SDAG-NEXT: v_mov_b32_e32 v30, s16
+; SDAG-NEXT: v_mov_b32_e32 v31, s17
+; SDAG-NEXT: v_mov_b32_e32 v32, s18
+; SDAG-NEXT: v_mov_b32_e32 v33, s19
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v23
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v24, s20 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s12, s0
+; GISEL-NEXT: s_mov_b32 s13, s1
+; GISEL-NEXT: s_mov_b32 s14, s2
+; GISEL-NEXT: s_mov_b32 s15, s3
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[18:19]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[12:13]
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v23
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v24, s20 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+; Scaled 32x32x64 MFMA with %arg1 and %scale1 marked inreg; %arg0, %arg2 and
+; %scale0 are ordinary arguments.
+; Expected code for both check prefixes: %arg1 is copied from s0-s3/s16-s19
+; into v[26:33] and used as the second matrix operand, the accumulator is
+; written from v8-v23 into a[0:15], and the MFMA takes the first scale from
+; v24 and the second directly from s20.
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr(<8 x i32> %arg0, <8 x i32> inreg %arg1, <16 x float> %arg2, i32 %scale0, i32 inreg %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT: v_mov_b32_e32 v26, s0
+; SDAG-NEXT: v_mov_b32_e32 v27, s1
+; SDAG-NEXT: v_mov_b32_e32 v28, s2
+; SDAG-NEXT: v_mov_b32_e32 v29, s3
+; SDAG-NEXT: v_mov_b32_e32 v30, s16
+; SDAG-NEXT: v_mov_b32_e32 v31, s17
+; SDAG-NEXT: v_mov_b32_e32 v32, s18
+; SDAG-NEXT: v_mov_b32_e32 v33, s19
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v23
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[26:33], a[0:15], v24, s20 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s12, s0
+; GISEL-NEXT: s_mov_b32 s13, s1
+; GISEL-NEXT: s_mov_b32 s14, s2
+; GISEL-NEXT: s_mov_b32 s15, s3
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[18:19]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[12:13]
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v23
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[26:33], a[0:15], v24, s20 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+; Scaled 32x32x64 MFMA with the accumulator %arg2 and %scale1 marked inreg;
+; %arg0, %arg1 and %scale0 are ordinary arguments.
+; Expected code: the accumulator is written into a[0:15] straight from SGPRs
+; (the GISEL variant first copies s0-s3 to s12-s15), the matrix operands stay
+; in v[0:7] and v[8:15], and the MFMA takes the first scale from v16 and the
+; second directly from s28.
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> inreg %arg2, i32 %scale0, i32 inreg %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; SDAG-NEXT: v_accvgpr_write_b32 a4, s16
+; SDAG-NEXT: v_accvgpr_write_b32 a5, s17
+; SDAG-NEXT: v_accvgpr_write_b32 a6, s18
+; SDAG-NEXT: v_accvgpr_write_b32 a7, s19
+; SDAG-NEXT: v_accvgpr_write_b32 a8, s20
+; SDAG-NEXT: v_accvgpr_write_b32 a9, s21
+; SDAG-NEXT: v_accvgpr_write_b32 a10, s22
+; SDAG-NEXT: v_accvgpr_write_b32 a11, s23
+; SDAG-NEXT: v_accvgpr_write_b32 a12, s24
+; SDAG-NEXT: v_accvgpr_write_b32 a13, s25
+; SDAG-NEXT: v_accvgpr_write_b32 a14, s26
+; SDAG-NEXT: v_accvgpr_write_b32 a15, s27
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, s28 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s12, s0
+; GISEL-NEXT: s_mov_b32 s13, s1
+; GISEL-NEXT: s_mov_b32 s14, s2
+; GISEL-NEXT: s_mov_b32 s15, s3
+; GISEL-NEXT: v_accvgpr_write_b32 a0, s12
+; GISEL-NEXT: v_accvgpr_write_b32 a1, s13
+; GISEL-NEXT: v_accvgpr_write_b32 a2, s14
+; GISEL-NEXT: v_accvgpr_write_b32 a3, s15
+; GISEL-NEXT: v_accvgpr_write_b32 a4, s16
+; GISEL-NEXT: v_accvgpr_write_b32 a5, s17
+; GISEL-NEXT: v_accvgpr_write_b32 a6, s18
+; GISEL-NEXT: v_accvgpr_write_b32 a7, s19
+; GISEL-NEXT: v_accvgpr_write_b32 a8, s20
+; GISEL-NEXT: v_accvgpr_write_b32 a9, s21
+; GISEL-NEXT: v_accvgpr_write_b32 a10, s22
+; GISEL-NEXT: v_accvgpr_write_b32 a11, s23
+; GISEL-NEXT: v_accvgpr_write_b32 a12, s24
+; GISEL-NEXT: v_accvgpr_write_b32 a13, s25
+; GISEL-NEXT: v_accvgpr_write_b32 a14, s26
+; GISEL-NEXT: v_accvgpr_write_b32 a15, s27
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, s28 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+; Scaled 32x32x64 MFMA with %arg0, the accumulator %arg2 and %scale1 marked
+; inreg; %arg1 and %scale0 are ordinary arguments.
+; Expected code for both check prefixes: %arg0 is copied from s0-s3/s16-s19
+; into v[32:39], and the accumulator is assembled in a[0:15] from s20-s29 and
+; v8-v13 via VGPR copies.
+; NOTE(review): although %scale1 is inreg and the test name ends in "sgpr",
+; the expected MFMA reads both scales from VGPRs (v14, v15). Presumably the
+; inreg SGPR budget is already used up by %arg0 and %arg2, so this case may
+; not actually cover an SGPR scale operand - confirm.
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <16 x float> inreg %arg2, i32 %scale0, i32 inreg %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v16, s20
+; SDAG-NEXT: v_mov_b32_e32 v31, v13
+; SDAG-NEXT: v_mov_b32_e32 v30, v12
+; SDAG-NEXT: v_mov_b32_e32 v29, v11
+; SDAG-NEXT: v_mov_b32_e32 v28, v10
+; SDAG-NEXT: v_mov_b32_e32 v27, v9
+; SDAG-NEXT: v_mov_b32_e32 v26, v8
+; SDAG-NEXT: v_mov_b32_e32 v17, s21
+; SDAG-NEXT: v_mov_b32_e32 v18, s22
+; SDAG-NEXT: v_mov_b32_e32 v19, s23
+; SDAG-NEXT: v_mov_b32_e32 v20, s24
+; SDAG-NEXT: v_mov_b32_e32 v21, s25
+; SDAG-NEXT: v_mov_b32_e32 v22, s26
+; SDAG-NEXT: v_mov_b32_e32 v23, s27
+; SDAG-NEXT: v_mov_b32_e32 v24, s28
+; SDAG-NEXT: v_mov_b32_e32 v25, s29
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_mov_b32_e32 v32, s0
+; SDAG-NEXT: v_mov_b32_e32 v33, s1
+; SDAG-NEXT: v_mov_b32_e32 v34, s2
+; SDAG-NEXT: v_mov_b32_e32 v35, s3
+; SDAG-NEXT: v_mov_b32_e32 v36, s16
+; SDAG-NEXT: v_mov_b32_e32 v37, s17
+; SDAG-NEXT: v_mov_b32_e32 v38, s18
+; SDAG-NEXT: v_mov_b32_e32 v39, s19
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v31
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[32:39], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v16, s20
+; GISEL-NEXT: s_mov_b32 s12, s0
+; GISEL-NEXT: s_mov_b32 s13, s1
+; GISEL-NEXT: s_mov_b32 s14, s2
+; GISEL-NEXT: s_mov_b32 s15, s3
+; GISEL-NEXT: v_mov_b32_e32 v26, v8
+; GISEL-NEXT: v_mov_b32_e32 v27, v9
+; GISEL-NEXT: v_mov_b32_e32 v28, v10
+; GISEL-NEXT: v_mov_b32_e32 v29, v11
+; GISEL-NEXT: v_mov_b32_e32 v30, v12
+; GISEL-NEXT: v_mov_b32_e32 v31, v13
+; GISEL-NEXT: v_mov_b32_e32 v17, s21
+; GISEL-NEXT: v_mov_b32_e32 v18, s22
+; GISEL-NEXT: v_mov_b32_e32 v19, s23
+; GISEL-NEXT: v_mov_b32_e32 v20, s24
+; GISEL-NEXT: v_mov_b32_e32 v21, s25
+; GISEL-NEXT: v_mov_b32_e32 v22, s26
+; GISEL-NEXT: v_mov_b32_e32 v23, s27
+; GISEL-NEXT: v_mov_b32_e32 v24, s28
+; GISEL-NEXT: v_mov_b32_e32 v25, s29
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[18:19]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[12:13]
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v31
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[32:39], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+; Scaled 32x32x64 MFMA where both scale operands are the constants 33 and -2;
+; the %scale0 and %scale1 parameters are declared but not used by the call.
+; Expected code (one shared check prefix): a15 is loaded from the stack with
+; scratch_load_dword, a0-a14 are written from v16-v30, and the MFMA encodes
+; both scales as immediate operands (33, -2) after waiting on the load.
+; NOTE(review): the call passes i32 2 in the operand preceding each scale
+; constant, whereas the register-scale tests in this file pass 0, yet the
+; expected asm still prints op_sel_hi:[0,0,0]. Presumably that operand is an
+; op_sel-style modifier that is not yet reflected in the output - confirm
+; against the intrinsic definition.
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_dword a15, off, s32
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: v_accvgpr_write_b32 a4, v20
+; GCN-NEXT: v_accvgpr_write_b32 a5, v21
+; GCN-NEXT: v_accvgpr_write_b32 a6, v22
+; GCN-NEXT: v_accvgpr_write_b32 a7, v23
+; GCN-NEXT: v_accvgpr_write_b32 a8, v24
+; GCN-NEXT: v_accvgpr_write_b32 a9, v25
+; GCN-NEXT: v_accvgpr_write_b32 a10, v26
+; GCN-NEXT: v_accvgpr_write_b32 a11, v27
+; GCN-NEXT: v_accvgpr_write_b32 a12, v28
+; GCN-NEXT: v_accvgpr_write_b32 a13, v29
+; GCN-NEXT: v_accvgpr_write_b32 a14, v30
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 33, -2 op_sel_hi:[0,0,0]
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 33, i32 2, i32 -2)
+ ret <16 x float> %result
+}
+
+; Scaled 32x32x64 f8f6f4 MFMA called with constant scale operands: 65 for the
+; first scale and -2 for the second (the %scale0/%scale1 parameters are unused).
+; In the expected output 65 is first moved into a register (0x41 in s0 for
+; SDAG, in v31 for GISEL), while -2 appears directly as an operand of the MFMA.
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: s_movk_i32 s0, 0x41
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, -2 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: v_mov_b32_e32 v31, 0x41
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, -2 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 -2)
+ ret <16 x float> %result
+}
+
+; Scaled 32x32x64 f8f6f4 MFMA called with two constant scale operands, 65 and
+; 77 (the %scale0/%scale1 parameters are unused). In the expected output both
+; constants are moved into registers before the MFMA: 0x41 into s0 and 0x4d
+; into v31 for SDAG, 0x41 into v31 and 0x4d into v32 for GISEL.
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: s_movk_i32 s0, 0x41
+; SDAG-NEXT: v_mov_b32_e32 v31, 0x4d
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v31 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: v_mov_b32_e32 v31, 0x41
+; GISEL-NEXT: v_mov_b32_e32 v32, 0x4d
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 77)
+ ret <16 x float> %result
+}
+
+; Kernel variant: both scale values are kernel arguments (%scale0, %scale1) and
+; the <16 x float> result is stored to %ptr. The expected MFMA takes the first
+; scale from s0 and the second from a VGPR copy of s1, and carries blgp:2
+; (presumably from the `i32 2` fifth call argument - confirm against the
+; intrinsic definition). Attribute group #0 is defined elsewhere in the file.
+define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1, ptr addrspace(1) %ptr) #0 {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
+; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40
+; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, s36
+; SDAG-NEXT: v_mov_b32_e32 v0, s8
+; SDAG-NEXT: v_mov_b32_e32 v1, s9
+; SDAG-NEXT: v_mov_b32_e32 v2, s10
+; SDAG-NEXT: v_mov_b32_e32 v3, s11
+; SDAG-NEXT: v_mov_b32_e32 v4, s12
+; SDAG-NEXT: v_mov_b32_e32 v5, s13
+; SDAG-NEXT: v_mov_b32_e32 v6, s14
+; SDAG-NEXT: v_mov_b32_e32 v7, s15
+; SDAG-NEXT: v_mov_b32_e32 v8, s16
+; SDAG-NEXT: v_mov_b32_e32 v9, s17
+; SDAG-NEXT: v_mov_b32_e32 v10, s18
+; SDAG-NEXT: v_mov_b32_e32 v11, s19
+; SDAG-NEXT: v_mov_b32_e32 v12, s20
+; SDAG-NEXT: v_mov_b32_e32 v13, s21
+; SDAG-NEXT: v_mov_b32_e32 v14, s22
+; SDAG-NEXT: v_mov_b32_e32 v15, s23
+; SDAG-NEXT: v_accvgpr_write_b32 a1, s37
+; SDAG-NEXT: v_accvgpr_write_b32 a2, s38
+; SDAG-NEXT: v_accvgpr_write_b32 a3, s39
+; SDAG-NEXT: v_accvgpr_write_b32 a4, s40
+; SDAG-NEXT: v_accvgpr_write_b32 a5, s41
+; SDAG-NEXT: v_accvgpr_write_b32 a6, s42
+; SDAG-NEXT: v_accvgpr_write_b32 a7, s43
+; SDAG-NEXT: v_accvgpr_write_b32 a8, s44
+; SDAG-NEXT: v_accvgpr_write_b32 a9, s45
+; SDAG-NEXT: v_accvgpr_write_b32 a10, s46
+; SDAG-NEXT: v_accvgpr_write_b32 a11, s47
+; SDAG-NEXT: v_accvgpr_write_b32 a12, s48
+; SDAG-NEXT: v_accvgpr_write_b32 a13, s49
+; SDAG-NEXT: v_accvgpr_write_b32 a14, s50
+; SDAG-NEXT: v_accvgpr_write_b32 a15, s51
+; SDAG-NEXT: v_mov_b32_e32 v17, s1
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v17 op_sel_hi:[0,0,0] blgp:2
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: global_store_dwordx4 v16, a[12:15], s[2:3] offset:48
+; SDAG-NEXT: global_store_dwordx4 v16, a[8:11], s[2:3] offset:32
+; SDAG-NEXT: global_store_dwordx4 v16, a[4:7], s[2:3] offset:16
+; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[2:3]
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
+; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40
+; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, s36
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; GISEL-NEXT: v_accvgpr_write_b32 a1, s37
+; GISEL-NEXT: v_accvgpr_write_b32 a2, s38
+; GISEL-NEXT: v_accvgpr_write_b32 a3, s39
+; GISEL-NEXT: v_accvgpr_write_b32 a4, s40
+; GISEL-NEXT: v_accvgpr_write_b32 a5, s41
+; GISEL-NEXT: v_accvgpr_write_b32 a6, s42
+; GISEL-NEXT: v_accvgpr_write_b32 a7, s43
+; GISEL-NEXT: v_accvgpr_write_b32 a8, s44
+; GISEL-NEXT: v_accvgpr_write_b32 a9, s45
+; GISEL-NEXT: v_accvgpr_write_b32 a10, s46
+; GISEL-NEXT: v_accvgpr_write_b32 a11, s47
+; GISEL-NEXT: v_accvgpr_write_b32 a12, s48
+; GISEL-NEXT: v_accvgpr_write_b32 a13, s49
+; GISEL-NEXT: v_accvgpr_write_b32 a14, s50
+; GISEL-NEXT: v_accvgpr_write_b32 a15, s51
+; GISEL-NEXT: v_mov_b32_e32 v16, s1
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel_hi:[0,0,0] blgp:2
+; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: s_nop 2
+; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3]
+; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[2:3] offset:16
+; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[2:3] offset:32
+; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[2:3] offset:48
+; GISEL-NEXT: s_endpgm
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1)
+ store <16 x float> %result, ptr addrspace(1) %ptr, align 64
+ ret void
+}
+
+; Kernel variant with constant scale operands 65 and -2; the result is stored
+; to %ptr. In the expected output 65 is first moved into a register (0x41 in
+; s2 for SDAG, in v16 for GISEL), -2 appears directly as an MFMA operand, and
+; the MFMA carries blgp:2 (presumably from the `i32 2` fifth call argument -
+; confirm against the intrinsic definition). Attribute group #0 is defined
+; elsewhere in the file.
+define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, ptr addrspace(1) %ptr) #0 {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40
+; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
+; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80
+; SDAG-NEXT: s_movk_i32 s2, 0x41
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, s36
+; SDAG-NEXT: v_mov_b32_e32 v0, s8
+; SDAG-NEXT: v_mov_b32_e32 v1, s9
+; SDAG-NEXT: v_mov_b32_e32 v2, s10
+; SDAG-NEXT: v_mov_b32_e32 v3, s11
+; SDAG-NEXT: v_mov_b32_e32 v4, s12
+; SDAG-NEXT: v_mov_b32_e32 v5, s13
+; SDAG-NEXT: v_mov_b32_e32 v6, s14
+; SDAG-NEXT: v_mov_b32_e32 v7, s15
+; SDAG-NEXT: v_mov_b32_e32 v8, s16
+; SDAG-NEXT: v_mov_b32_e32 v9, s17
+; SDAG-NEXT: v_mov_b32_e32 v10, s18
+; SDAG-NEXT: v_mov_b32_e32 v11, s19
+; SDAG-NEXT: v_mov_b32_e32 v12, s20
+; SDAG-NEXT: v_mov_b32_e32 v13, s21
+; SDAG-NEXT: v_mov_b32_e32 v14, s22
+; SDAG-NEXT: v_mov_b32_e32 v15, s23
+; SDAG-NEXT: v_accvgpr_write_b32 a1, s37
+; SDAG-NEXT: v_accvgpr_write_b32 a2, s38
+; SDAG-NEXT: v_accvgpr_write_b32 a3, s39
+; SDAG-NEXT: v_accvgpr_write_b32 a4, s40
+; SDAG-NEXT: v_accvgpr_write_b32 a5, s41
+; SDAG-NEXT: v_accvgpr_write_b32 a6, s42
+; SDAG-NEXT: v_accvgpr_write_b32 a7, s43
+; SDAG-NEXT: v_accvgpr_write_b32 a8, s44
+; SDAG-NEXT: v_accvgpr_write_b32 a9, s45
+; SDAG-NEXT: v_accvgpr_write_b32 a10, s46
+; SDAG-NEXT: v_accvgpr_write_b32 a11, s47
+; SDAG-NEXT: v_accvgpr_write_b32 a12, s48
+; SDAG-NEXT: v_accvgpr_write_b32 a13, s49
+; SDAG-NEXT: v_accvgpr_write_b32 a14, s50
+; SDAG-NEXT: v_accvgpr_write_b32 a15, s51
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s2, -2 op_sel_hi:[0,0,0] blgp:2
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: global_store_dwordx4 v16, a[12:15], s[0:1] offset:48
+; SDAG-NEXT: global_store_dwordx4 v16, a[8:11], s[0:1] offset:32
+; SDAG-NEXT: global_store_dwordx4 v16, a[4:7], s[0:1] offset:16
+; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[0:1]
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
+; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40
+; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80
+; GISEL-NEXT: v_mov_b32_e32 v16, 0x41
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, s36
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; GISEL-NEXT: v_accvgpr_write_b32 a1, s37
+; GISEL-NEXT: v_accvgpr_write_b32 a2, s38
+; GISEL-NEXT: v_accvgpr_write_b32 a3, s39
+; GISEL-NEXT: v_accvgpr_write_b32 a4, s40
+; GISEL-NEXT: v_accvgpr_write_b32 a5, s41
+; GISEL-NEXT: v_accvgpr_write_b32 a6, s42
+; GISEL-NEXT: v_accvgpr_write_b32 a7, s43
+; GISEL-NEXT: v_accvgpr_write_b32 a8, s44
+; GISEL-NEXT: v_accvgpr_write_b32 a9, s45
+; GISEL-NEXT: v_accvgpr_write_b32 a10, s46
+; GISEL-NEXT: v_accvgpr_write_b32 a11, s47
+; GISEL-NEXT: v_accvgpr_write_b32 a12, s48
+; GISEL-NEXT: v_accvgpr_write_b32 a13, s49
+; GISEL-NEXT: v_accvgpr_write_b32 a14, s50
+; GISEL-NEXT: v_accvgpr_write_b32 a15, s51
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, -2 op_sel_hi:[0,0,0] blgp:2
+; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: s_nop 2
+; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GISEL-NEXT: s_endpgm
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 65, i32 1, i32 -2)
+ store <16 x float> %result, ptr addrspace(1) %ptr, align 64
+ ret void
+}
+
+; Kernel variant in which the accumulator input %arg2 is still used after the
+; MFMA: both %arg2 and %result are written with volatile stores. Both scale
+; values are kernel arguments. In the expected output the original %arg2 is
+; stored from copies of the SGPRs it was loaded into (s[8:23]) and the result
+; is stored from a[0:15]. Attribute group #1 is defined elsewhere in the file.
+define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) #1 {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v0, s12
+; SDAG-NEXT: v_mov_b32_e32 v1, s13
+; SDAG-NEXT: v_mov_b32_e32 v2, s14
+; SDAG-NEXT: v_mov_b32_e32 v3, s15
+; SDAG-NEXT: v_mov_b32_e32 v4, s16
+; SDAG-NEXT: v_mov_b32_e32 v5, s17
+; SDAG-NEXT: v_mov_b32_e32 v6, s18
+; SDAG-NEXT: v_mov_b32_e32 v7, s19
+; SDAG-NEXT: v_mov_b32_e32 v8, s20
+; SDAG-NEXT: v_mov_b32_e32 v9, s21
+; SDAG-NEXT: v_mov_b32_e32 v10, s22
+; SDAG-NEXT: v_mov_b32_e32 v11, s23
+; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
+; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80
+; SDAG-NEXT: v_mov_b32_e32 v12, s24
+; SDAG-NEXT: v_mov_b32_e32 v13, s25
+; SDAG-NEXT: v_mov_b32_e32 v14, s26
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; SDAG-NEXT: v_mov_b32_e32 v15, s27
+; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; SDAG-NEXT: v_accvgpr_write_b32 a4, s12
+; SDAG-NEXT: v_accvgpr_write_b32 a5, s13
+; SDAG-NEXT: v_accvgpr_write_b32 a6, s14
+; SDAG-NEXT: v_accvgpr_write_b32 a7, s15
+; SDAG-NEXT: v_accvgpr_write_b32 a8, s16
+; SDAG-NEXT: v_accvgpr_write_b32 a9, s17
+; SDAG-NEXT: v_accvgpr_write_b32 a10, s18
+; SDAG-NEXT: v_accvgpr_write_b32 a11, s19
+; SDAG-NEXT: v_accvgpr_write_b32 a12, s20
+; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
+; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
+; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; SDAG-NEXT: v_mov_b32_e32 v16, s1
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mov_b32_e32 v0, s20
+; SDAG-NEXT: v_mov_b32_e32 v1, s21
+; SDAG-NEXT: v_mov_b32_e32 v2, s22
+; SDAG-NEXT: v_mov_b32_e32 v3, s23
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], 48
+; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], 32
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], 16
+; SDAG-NEXT: v_mov_b32_e32 v0, s16
+; SDAG-NEXT: v_mov_b32_e32 v1, s17
+; SDAG-NEXT: v_mov_b32_e32 v2, s18
+; SDAG-NEXT: v_mov_b32_e32 v3, s19
+; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], 0
+; SDAG-NEXT: v_mov_b32_e32 v0, s12
+; SDAG-NEXT: v_mov_b32_e32 v1, s13
+; SDAG-NEXT: v_mov_b32_e32 v2, s14
+; SDAG-NEXT: v_mov_b32_e32 v3, s15
+; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mov_b32_e32 v0, s8
+; SDAG-NEXT: v_mov_b32_e32 v1, s9
+; SDAG-NEXT: v_mov_b32_e32 v2, s10
+; SDAG-NEXT: v_mov_b32_e32 v3, s11
+; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[6:7], a[8:11], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[4:5], a[12:15], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[10:11], a[0:3], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[8:9], a[4:7], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
+; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
+; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
+; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
+; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
+; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
+; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
+; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
+; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
+; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
+; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
+; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
+; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
+; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
+; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
+; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
+; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
+; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GISEL-NEXT: v_mov_b32_e32 v16, s1
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel_hi:[0,0,0]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], 0
+; GISEL-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], 16
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], 32
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GISEL-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], 48
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19]
+; GISEL-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23]
+; GISEL-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[4:5], a[0:3], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[6:7], a[4:7], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[8:9], a[8:11], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[10:11], a[12:15], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_endpgm
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ; Storing %arg2 after the call keeps the accumulator input live past the MFMA.
+ store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64
+ store volatile <16 x float> %result, ptr addrspace(1) null, align 64
+ ret void
+}
+
+; Kernel variant in which the accumulator input %arg2 is still used after the
+; MFMA (both %arg2 and %result are written with volatile stores), with constant
+; scale operands 25 and 42. In the expected output both constants appear
+; directly as MFMA operands and the MFMA carries blgp:2 (presumably from the
+; `i32 2` fifth call argument - confirm against the intrinsic definition).
+; Attribute group #1 is defined elsewhere in the file.
+define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) #1 {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v0, s12
+; SDAG-NEXT: v_mov_b32_e32 v1, s13
+; SDAG-NEXT: v_mov_b32_e32 v2, s14
+; SDAG-NEXT: v_mov_b32_e32 v3, s15
+; SDAG-NEXT: v_mov_b32_e32 v4, s16
+; SDAG-NEXT: v_mov_b32_e32 v5, s17
+; SDAG-NEXT: v_mov_b32_e32 v6, s18
+; SDAG-NEXT: v_mov_b32_e32 v7, s19
+; SDAG-NEXT: v_mov_b32_e32 v8, s20
+; SDAG-NEXT: v_mov_b32_e32 v9, s21
+; SDAG-NEXT: v_mov_b32_e32 v10, s22
+; SDAG-NEXT: v_mov_b32_e32 v11, s23
+; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
+; SDAG-NEXT: v_mov_b32_e32 v12, s24
+; SDAG-NEXT: v_mov_b32_e32 v13, s25
+; SDAG-NEXT: v_mov_b32_e32 v14, s26
+; SDAG-NEXT: v_mov_b32_e32 v15, s27
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; SDAG-NEXT: v_accvgpr_write_b32 a4, s12
+; SDAG-NEXT: v_accvgpr_write_b32 a5, s13
+; SDAG-NEXT: v_accvgpr_write_b32 a6, s14
+; SDAG-NEXT: v_accvgpr_write_b32 a7, s15
+; SDAG-NEXT: v_accvgpr_write_b32 a8, s16
+; SDAG-NEXT: v_accvgpr_write_b32 a9, s17
+; SDAG-NEXT: v_accvgpr_write_b32 a10, s18
+; SDAG-NEXT: v_accvgpr_write_b32 a11, s19
+; SDAG-NEXT: v_accvgpr_write_b32 a12, s20
+; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
+; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
+; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2
+; SDAG-NEXT: v_mov_b32_e32 v0, s20
+; SDAG-NEXT: v_mov_b32_e32 v1, s21
+; SDAG-NEXT: v_mov_b32_e32 v2, s22
+; SDAG-NEXT: v_mov_b32_e32 v3, s23
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], 48
+; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], 32
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], 16
+; SDAG-NEXT: v_mov_b32_e32 v0, s16
+; SDAG-NEXT: v_mov_b32_e32 v1, s17
+; SDAG-NEXT: v_mov_b32_e32 v2, s18
+; SDAG-NEXT: v_mov_b32_e32 v3, s19
+; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], 0
+; SDAG-NEXT: v_mov_b32_e32 v0, s12
+; SDAG-NEXT: v_mov_b32_e32 v1, s13
+; SDAG-NEXT: v_mov_b32_e32 v2, s14
+; SDAG-NEXT: v_mov_b32_e32 v3, s15
+; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mov_b32_e32 v0, s8
+; SDAG-NEXT: v_mov_b32_e32 v1, s9
+; SDAG-NEXT: v_mov_b32_e32 v2, s10
+; SDAG-NEXT: v_mov_b32_e32 v3, s11
+; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[6:7], a[8:11], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[4:5], a[12:15], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[10:11], a[0:3], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[8:9], a[4:7], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
+; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
+; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
+; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
+; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
+; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
+; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
+; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
+; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
+; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
+; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
+; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
+; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
+; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
+; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
+; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
+; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], 0
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], 16
+; GISEL-NEXT: global_store_dwordx4 v[4:5], v[16:19], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], 32
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], 48
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19]
+; GISEL-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23]
+; GISEL-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[4:5], a[0:3], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[6:7], a[4:7], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[8:9], a[8:11], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[10:11], a[12:15], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_endpgm
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42)
+ ; Storing %arg2 after the call keeps the accumulator input live past the MFMA.
+ store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64
+ store volatile <16 x float> %result, ptr addrspace(1) null, align 64
+ ret void
+}
+
+define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) #0 {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonmac:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v0, s12
+; SDAG-NEXT: v_mov_b32_e32 v1, s13
+; SDAG-NEXT: v_mov_b32_e32 v2, s14
+; SDAG-NEXT: v_mov_b32_e32 v3, s15
+; SDAG-NEXT: v_mov_b32_e32 v4, s16
+; SDAG-NEXT: v_mov_b32_e32 v5, s17
+; SDAG-NEXT: v_mov_b32_e32 v6, s18
+; SDAG-NEXT: v_mov_b32_e32 v7, s19
+; SDAG-NEXT: v_mov_b32_e32 v8, s20
+; SDAG-NEXT: v_mov_b32_e32 v9, s21
+; SDAG-NEXT: v_mov_b32_e32 v10, s22
+; SDAG-NEXT: v_mov_b32_e32 v11, s23
+; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
+; SDAG-NEXT: v_mov_b32_e32 v12, s24
+; SDAG-NEXT: v_mov_b32_e32 v13, s25
+; SDAG-NEXT: v_mov_b32_e32 v14, s26
+; SDAG-NEXT: v_mov_b32_e32 v15, s27
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; SDAG-NEXT: v_accvgpr_write_b32 a4, s12
+; SDAG-NEXT: v_accvgpr_write_b32 a5, s13
+; SDAG-NEXT: v_accvgpr_write_b32 a6, s14
+; SDAG-NEXT: v_accvgpr_write_b32 a7, s15
+; SDAG-NEXT: v_accvgpr_write_b32 a8, s16
+; SDAG-NEXT: v_accvgpr_write_b32 a9, s17
+; SDAG-NEXT: v_accvgpr_write_b32 a10, s18
+; SDAG-NEXT: v_accvgpr_write_b32 a11, s19
+; SDAG-NEXT: v_accvgpr_write_b32 a12, s20
+; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
+; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
+; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 0, 0 op_sel_hi:[0,0,0] blgp:2
+; SDAG-NEXT: v_mov_b32_e32 v0, s20
+; SDAG-NEXT: v_mov_b32_e32 v1, s21
+; SDAG-NEXT: v_mov_b32_e32 v2, s22
+; SDAG-NEXT: v_mov_b32_e32 v3, s23
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], 48
+; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], 32
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], 16
+; SDAG-NEXT: v_mov_b32_e32 v0, s16
+; SDAG-NEXT: v_mov_b32_e32 v1, s17
+; SDAG-NEXT: v_mov_b32_e32 v2, s18
+; SDAG-NEXT: v_mov_b32_e32 v3, s19
+; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], 0
+; SDAG-NEXT: v_mov_b32_e32 v0, s12
+; SDAG-NEXT: v_mov_b32_e32 v1, s13
+; SDAG-NEXT: v_mov_b32_e32 v2, s14
+; SDAG-NEXT: v_mov_b32_e32 v3, s15
+; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mov_b32_e32 v0, s8
+; SDAG-NEXT: v_mov_b32_e32 v1, s9
+; SDAG-NEXT: v_mov_b32_e32 v2, s10
+; SDAG-NEXT: v_mov_b32_e32 v3, s11
+; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[6:7], a[8:11], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[4:5], a[12:15], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[10:11], a[0:3], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[8:9], a[4:7], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonmac:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
+; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
+; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
+; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
+; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
+; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
+; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
+; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
+; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
+; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
+; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
+; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
+; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
+; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
+; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
+; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
+; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 0, 0 op_sel_hi:[0,0,0] blgp:2
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], 0
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], 16
+; GISEL-NEXT: global_store_dwordx4 v[4:5], v[16:19], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], 32
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], 48
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19]
+; GISEL-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23]
+; GISEL-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[4:5], a[0:3], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[6:7], a[4:7], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[8:9], a[8:11], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[10:11], a[12:15], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_endpgm
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0)
+ store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64
+ store volatile <16 x float> %result, ptr addrspace(1) null, align 64
+ ret void
+}
+
+; Kernel variant that calls the scaled MFMA intrinsic with the immediate scale
+; values 25 and 42. The 4th and 5th call arguments are 0 and 2 (labelled cbsz
+; and blgp on the annotated calls further down in this file); the expected
+; instruction carries "25, 42" as its scale operands together with blgp:2.
+; Both the incoming %arg2 and %result are written with volatile stores to a
+; null addrspace(1) pointer, so the original accumulator value is emitted in
+; addition to the MFMA result.
+; NOTE(review): attribute group #0 and the intent behind the "vgprcd_nonmac"
+; suffix are defined outside this excerpt -- confirm before relying on them.
+define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) #0 {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_nonmac:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v0, s12
+; SDAG-NEXT: v_mov_b32_e32 v1, s13
+; SDAG-NEXT: v_mov_b32_e32 v2, s14
+; SDAG-NEXT: v_mov_b32_e32 v3, s15
+; SDAG-NEXT: v_mov_b32_e32 v4, s16
+; SDAG-NEXT: v_mov_b32_e32 v5, s17
+; SDAG-NEXT: v_mov_b32_e32 v6, s18
+; SDAG-NEXT: v_mov_b32_e32 v7, s19
+; SDAG-NEXT: v_mov_b32_e32 v8, s20
+; SDAG-NEXT: v_mov_b32_e32 v9, s21
+; SDAG-NEXT: v_mov_b32_e32 v10, s22
+; SDAG-NEXT: v_mov_b32_e32 v11, s23
+; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
+; SDAG-NEXT: v_mov_b32_e32 v12, s24
+; SDAG-NEXT: v_mov_b32_e32 v13, s25
+; SDAG-NEXT: v_mov_b32_e32 v14, s26
+; SDAG-NEXT: v_mov_b32_e32 v15, s27
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; SDAG-NEXT: v_accvgpr_write_b32 a4, s12
+; SDAG-NEXT: v_accvgpr_write_b32 a5, s13
+; SDAG-NEXT: v_accvgpr_write_b32 a6, s14
+; SDAG-NEXT: v_accvgpr_write_b32 a7, s15
+; SDAG-NEXT: v_accvgpr_write_b32 a8, s16
+; SDAG-NEXT: v_accvgpr_write_b32 a9, s17
+; SDAG-NEXT: v_accvgpr_write_b32 a10, s18
+; SDAG-NEXT: v_accvgpr_write_b32 a11, s19
+; SDAG-NEXT: v_accvgpr_write_b32 a12, s20
+; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
+; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
+; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2
+; SDAG-NEXT: v_mov_b32_e32 v0, s20
+; SDAG-NEXT: v_mov_b32_e32 v1, s21
+; SDAG-NEXT: v_mov_b32_e32 v2, s22
+; SDAG-NEXT: v_mov_b32_e32 v3, s23
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], 48
+; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], 32
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], 16
+; SDAG-NEXT: v_mov_b32_e32 v0, s16
+; SDAG-NEXT: v_mov_b32_e32 v1, s17
+; SDAG-NEXT: v_mov_b32_e32 v2, s18
+; SDAG-NEXT: v_mov_b32_e32 v3, s19
+; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], 0
+; SDAG-NEXT: v_mov_b32_e32 v0, s12
+; SDAG-NEXT: v_mov_b32_e32 v1, s13
+; SDAG-NEXT: v_mov_b32_e32 v2, s14
+; SDAG-NEXT: v_mov_b32_e32 v3, s15
+; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mov_b32_e32 v0, s8
+; SDAG-NEXT: v_mov_b32_e32 v1, s9
+; SDAG-NEXT: v_mov_b32_e32 v2, s10
+; SDAG-NEXT: v_mov_b32_e32 v3, s11
+; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[6:7], a[8:11], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[4:5], a[12:15], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[10:11], a[0:3], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[8:9], a[4:7], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_nonmac:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
+; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
+; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
+; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
+; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
+; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
+; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
+; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
+; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
+; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
+; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
+; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
+; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
+; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
+; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
+; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
+; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], 0
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], 16
+; GISEL-NEXT: global_store_dwordx4 v[4:5], v[16:19], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], 32
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], 48
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19]
+; GISEL-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23]
+; GISEL-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[4:5], a[0:3], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[6:7], a[4:7], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[8:9], a[8:11], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[10:11], a[12:15], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_endpgm
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42)
+ store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64
+ store volatile <16 x float> %result, ptr addrspace(1) null, align 64
+ ret void
+}
+
+; Baseline constant-scale case: every i32 argument after %arg2 is the literal 0,
+; so both scale operands of the expected instruction are the immediate 0 and no
+; cbsz/blgp modifier is printed. %scale0 and %scale1 are declared but unused.
+; A single shared check prefix is used, i.e. both selectors are expected to
+; produce the same code for this function.
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_a:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_dword a15, off, s32
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: v_accvgpr_write_b32 a4, v20
+; GCN-NEXT: v_accvgpr_write_b32 a5, v21
+; GCN-NEXT: v_accvgpr_write_b32 a6, v22
+; GCN-NEXT: v_accvgpr_write_b32 a7, v23
+; GCN-NEXT: v_accvgpr_write_b32 a8, v24
+; GCN-NEXT: v_accvgpr_write_b32 a9, v25
+; GCN-NEXT: v_accvgpr_write_b32 a10, v26
+; GCN-NEXT: v_accvgpr_write_b32 a11, v27
+; GCN-NEXT: v_accvgpr_write_b32 a12, v28
+; GCN-NEXT: v_accvgpr_write_b32 a13, v29
+; GCN-NEXT: v_accvgpr_write_b32 a14, v30
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 0, 0 op_sel_hi:[0,0,0]
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ ret <16 x float> %result
+}
+
+; Same as the _a variant except that the 6th and 8th call arguments are 3 and 1
+; instead of 0, while both scale values (7th and 9th arguments) remain 0. The
+; expected instruction is identical to the _a variant: "0, 0" scale operands
+; and no extra modifier.
+; NOTE(review): the 6th/8th arguments are presumably the op_sel selectors for
+; the two scale operands -- confirm against the intrinsic definition.
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_b(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_b:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_dword a15, off, s32
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: v_accvgpr_write_b32 a4, v20
+; GCN-NEXT: v_accvgpr_write_b32 a5, v21
+; GCN-NEXT: v_accvgpr_write_b32 a6, v22
+; GCN-NEXT: v_accvgpr_write_b32 a7, v23
+; GCN-NEXT: v_accvgpr_write_b32 a8, v24
+; GCN-NEXT: v_accvgpr_write_b32 a9, v25
+; GCN-NEXT: v_accvgpr_write_b32 a10, v26
+; GCN-NEXT: v_accvgpr_write_b32 a11, v27
+; GCN-NEXT: v_accvgpr_write_b32 a12, v28
+; GCN-NEXT: v_accvgpr_write_b32 a13, v29
+; GCN-NEXT: v_accvgpr_write_b32 a14, v30
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 0, 0 op_sel_hi:[0,0,0]
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 3, i32 0, i32 1, i32 0)
+ ret <16 x float> %result
+}
+
+; Constant-scale case where only the last call argument (the second scale
+; value) is 1 and every other i32 argument is 0. The expected instruction
+; prints the scale operands as the immediates "0, 1". %scale0 and %scale1 are
+; declared but unused.
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_dword a15, off, s32
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: v_accvgpr_write_b32 a4, v20
+; GCN-NEXT: v_accvgpr_write_b32 a5, v21
+; GCN-NEXT: v_accvgpr_write_b32 a6, v22
+; GCN-NEXT: v_accvgpr_write_b32 a7, v23
+; GCN-NEXT: v_accvgpr_write_b32 a8, v24
+; GCN-NEXT: v_accvgpr_write_b32 a9, v25
+; GCN-NEXT: v_accvgpr_write_b32 a10, v26
+; GCN-NEXT: v_accvgpr_write_b32 a11, v27
+; GCN-NEXT: v_accvgpr_write_b32 a12, v28
+; GCN-NEXT: v_accvgpr_write_b32 a13, v29
+; GCN-NEXT: v_accvgpr_write_b32 a14, v30
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 0, 1 op_sel_hi:[0,0,0]
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1)
+ ret <16 x float> %result
+}
+
+; Constant-scale case where only the 7th call argument (the first scale value)
+; is 1 and every other i32 argument is 0. The expected instruction prints the
+; scale operands as the immediates "1, 0". %scale0 and %scale1 are declared but
+; unused.
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_dword a15, off, s32
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: v_accvgpr_write_b32 a4, v20
+; GCN-NEXT: v_accvgpr_write_b32 a5, v21
+; GCN-NEXT: v_accvgpr_write_b32 a6, v22
+; GCN-NEXT: v_accvgpr_write_b32 a7, v23
+; GCN-NEXT: v_accvgpr_write_b32 a8, v24
+; GCN-NEXT: v_accvgpr_write_b32 a9, v25
+; GCN-NEXT: v_accvgpr_write_b32 a10, v26
+; GCN-NEXT: v_accvgpr_write_b32 a11, v27
+; GCN-NEXT: v_accvgpr_write_b32 a12, v28
+; GCN-NEXT: v_accvgpr_write_b32 a13, v29
+; GCN-NEXT: v_accvgpr_write_b32 a14, v30
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 1, 0 op_sel_hi:[0,0,0]
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0)
+ ret <16 x float> %result
+}
+
+; --------------------------------------------------------------------
+; Incorrect signature for format cases (IR vector too large)
+; --------------------------------------------------------------------
+
+; Both matrix operands are passed as <8 x i32> while the call selects cbsz=0 and
+; blgp=2; the scale values come from the %scale0/%scale1 function arguments.
+; The expected instruction prints blgp:2 and no cbsz modifier. In both
+; selectors' expected output the first scale operand is the register loaded
+; from s32 offset:4 and the second is the one loaded from offset:8; only the
+; register numbering (v32, v31 versus v31, v32) differs between them.
+; NOTE(review): per the section banner and the function name, blgp=2 presumably
+; selects an fp6 format for which <8 x i32> is wider than required -- confirm.
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] blgp:2
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] blgp:2
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 0, ; cbsz
+ i32 2, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+; Mirror of the fp8/fp6 case: both matrix operands are <8 x i32>, but here the
+; call selects cbsz=2 and blgp=0, with the scale values taken from
+; %scale0/%scale1. The expected instruction prints cbsz:2 and no blgp modifier.
+; As in the neighbouring tests, the two selectors differ only in which of
+; v31/v32 receives the values loaded from s32 offset:4 and offset:8.
+; NOTE(review): per the section banner and the function name, cbsz=2 presumably
+; selects an fp6 format for which <8 x i32> is wider than required -- confirm.
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:2
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:2
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 2, ; cbsz
+ i32 0, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+; Both matrix operands are <8 x i32> and the call selects cbsz=2 and blgp=2,
+; with the scale values taken from %scale0/%scale1. The expected instruction
+; prints both cbsz:2 and blgp:2. As in the neighbouring tests, the two
+; selectors differ only in which of v31/v32 receives the values loaded from
+; s32 offset:4 and offset:8.
+; NOTE(review): per the section banner and the function name, the value 2
+; presumably selects an fp6 format for which <8 x i32> is wider than required
+; -- confirm.
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:2 blgp:2
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:2 blgp:2
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 2, ; cbsz
+ i32 2, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+; Same cbsz=2 / blgp=2 format selection as the preceding fp6/fp6 test, but the
+; function takes no scale parameters and passes the literal 0 for every
+; remaining i32 argument. The expected instruction therefore prints "0, 0" as
+; its scale operands together with cbsz:2 blgp:2, and a single shared check
+; prefix covers both selectors.
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_dword a15, off, s32
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: v_accvgpr_write_b32 a4, v20
+; GCN-NEXT: v_accvgpr_write_b32 a5, v21
+; GCN-NEXT: v_accvgpr_write_b32 a6, v22
+; GCN-NEXT: v_accvgpr_write_b32 a7, v23
+; GCN-NEXT: v_accvgpr_write_b32 a8, v24
+; GCN-NEXT: v_accvgpr_write_b32 a9, v25
+; GCN-NEXT: v_accvgpr_write_b32 a10, v26
+; GCN-NEXT: v_accvgpr_write_b32 a11, v27
+; GCN-NEXT: v_accvgpr_write_b32 a12, v28
+; GCN-NEXT: v_accvgpr_write_b32 a13, v29
+; GCN-NEXT: v_accvgpr_write_b32 a14, v30
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 0, 0 op_sel_hi:[0,0,0] cbsz:2 blgp:2
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 2, ; cbsz
+ i32 2, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] blgp:4
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] blgp:4
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 0, ; cbsz: srcA (%arg0) format, 0 = fp8 (default, so no cbsz modifier is printed)
+ i32 4, ; blgp: srcB (%arg1) format, 4 = fp4
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:4
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:4
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 4, ; cbsz: srcA (%arg0) format, 4 = fp4
+ i32 0, ; blgp: srcB (%arg1) format, 0 = fp8 (default, so no blgp modifier is printed)
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_dword v31, off, s32
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: v_accvgpr_write_b32 a4, v18
+; GCN-NEXT: v_accvgpr_write_b32 a5, v19
+; GCN-NEXT: v_accvgpr_write_b32 a6, v20
+; GCN-NEXT: v_accvgpr_write_b32 a7, v21
+; GCN-NEXT: v_accvgpr_write_b32 a8, v22
+; GCN-NEXT: v_accvgpr_write_b32 a9, v23
+; GCN-NEXT: v_accvgpr_write_b32 a10, v24
+; GCN-NEXT: v_accvgpr_write_b32 a11, v25
+; GCN-NEXT: v_accvgpr_write_b32 a12, v26
+; GCN-NEXT: v_accvgpr_write_b32 a13, v27
+; GCN-NEXT: v_accvgpr_write_b32 a14, v28
+; GCN-NEXT: v_accvgpr_write_b32 a15, v29
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:4
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+ i32 0, ; cbsz: srcA (%arg0) format, 0 = fp8 (default, so no cbsz modifier is printed)
+ i32 4, ; blgp: srcB (%arg1, passed as <6 x i32>) format, 4 = fp4
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_dword v31, off, s32
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: v_accvgpr_write_b32 a4, v18
+; GCN-NEXT: v_accvgpr_write_b32 a5, v19
+; GCN-NEXT: v_accvgpr_write_b32 a6, v20
+; GCN-NEXT: v_accvgpr_write_b32 a7, v21
+; GCN-NEXT: v_accvgpr_write_b32 a8, v22
+; GCN-NEXT: v_accvgpr_write_b32 a9, v23
+; GCN-NEXT: v_accvgpr_write_b32 a10, v24
+; GCN-NEXT: v_accvgpr_write_b32 a11, v25
+; GCN-NEXT: v_accvgpr_write_b32 a12, v26
+; GCN-NEXT: v_accvgpr_write_b32 a13, v27
+; GCN-NEXT: v_accvgpr_write_b32 a14, v28
+; GCN-NEXT: v_accvgpr_write_b32 a15, v29
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:4
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 4, ; cbsz: srcA (%arg0, passed as <6 x i32>) format, 4 = fp4
+ i32 0, ; blgp: srcB (%arg1) format, 0 = fp8 (default, so no blgp modifier is printed)
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:4 blgp:4
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:4 blgp:4
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 4, ; cbsz: srcA (%arg0) format, 4 = fp4
+ i32 4, ; blgp: srcB (%arg1) format, 4 = fp4
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_dword a15, off, s32
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: v_accvgpr_write_b32 a4, v20
+; GCN-NEXT: v_accvgpr_write_b32 a5, v21
+; GCN-NEXT: v_accvgpr_write_b32 a6, v22
+; GCN-NEXT: v_accvgpr_write_b32 a7, v23
+; GCN-NEXT: v_accvgpr_write_b32 a8, v24
+; GCN-NEXT: v_accvgpr_write_b32 a9, v25
+; GCN-NEXT: v_accvgpr_write_b32 a10, v26
+; GCN-NEXT: v_accvgpr_write_b32 a11, v27
+; GCN-NEXT: v_accvgpr_write_b32 a12, v28
+; GCN-NEXT: v_accvgpr_write_b32 a13, v29
+; GCN-NEXT: v_accvgpr_write_b32 a14, v30
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 0, 0 op_sel_hi:[0,0,0] cbsz:4 blgp:4
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 4, ; cbsz: srcA (%arg0) format, 4 = fp4
+ i32 4, ; blgp: srcB (%arg1) format, 4 = fp4
+ i32 0, i32 0, i32 0, i32 0)
+ ret <16 x float> %result
+}
+
+declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32>, <8 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2
+declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32>, <6 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2
+declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32>, <4 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2
+declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32>, <6 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2
+declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32>, <8 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2
+declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32>, <4 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2
+declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32>, <8 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2
+declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32>, <4 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2
+declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32>, <6 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2
+
+attributes #0 = { "amdgpu-flat-work-group-size"="512,512" }
+attributes #1 = { "amdgpu-flat-work-group-size"="128,128" }
+attributes #2 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }
diff --git a/llvm/test/MC/AMDGPU/mai-gfx950.s b/llvm/test/MC/AMDGPU/mai-gfx950.s
index a692693638c692..07d930b0b64dfb 100644
--- a/llvm/test/MC/AMDGPU/mai-gfx950.s
+++ b/llvm/test/MC/AMDGPU/mai-gfx950.s
@@ -2,6 +2,13 @@
// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx940 %s 2>&1 | FileCheck -check-prefix=ERR %s
// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx90a %s 2>&1 | FileCheck -check-prefix=ERR %s
+// cbsz = format of SrcA
+// blgp = format of SrcB
+
+// 0 = fp8, 1 = bf8
+// 2 = fp6, 3 = bf6
+// 4 = fp4
+
//===----------------------------------------------------------------------===//
// MFMA opcodes.
//===----------------------------------------------------------------------===//
@@ -275,3 +282,612 @@ v_mfma_ld_scale_b32 v1, v1 op_sel:[1,0] op_sel_hi:[0,1]
// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
v_mfma_ld_scale_b32 v1, v1 op_sel:[0,1] op_sel_hi:[1,0]
+
+//===----------------------------------------------------------------------===//
+// v_mfma_f32_16x16x128_f8f6f4
+//===----------------------------------------------------------------------===//
+
+// GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] ; encoding: [0x00,0x00,0xad,0xd3,0x04,0x09,0x02,0x04]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3]
+
+// GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] blgp:1 ; encoding: [0x00,0x00,0xad,0xd3,0x04,0x09,0x02,0x24]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] blgp:1
+
+// GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[4:11], v[0:3] cbsz:3 ; encoding: [0x00,0x03,0xad,0xd3,0x04,0x09,0x02,0x04]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[4:11], v[0:3] cbsz:3
+
+// GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] ; encoding: [0x00,0x00,0xad,0xd3,0x04,0x09,0x02,0x04]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3]
+
+// GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[4:11], v[0:3] cbsz:3 blgp:1 ; encoding: [0x00,0x03,0xad,0xd3,0x04,0x09,0x02,0x24]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[4:11], v[0:3] cbsz:3 blgp:1
+
+// GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[4:11], a[0:3] ; encoding: [0x00,0x80,0xad,0xd3,0x04,0x09,0x02,0x1c]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[4:11], a[0:3]
+
+// GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[4:11], a[0:3] blgp:1 ; encoding: [0x00,0x80,0xad,0xd3,0x04,0x09,0x02,0x3c]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[4:11], a[0:3] blgp:1
+
+// GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[4:11], a[0:3] cbsz:3 ; encoding: [0x00,0x83,0xad,0xd3,0x04,0x09,0x02,0x1c]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[4:11], a[0:3] cbsz:3
+
+// GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[4:11], a[0:3] ; encoding: [0x00,0x80,0xad,0xd3,0x04,0x09,0x02,0x1c]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[4:11], a[0:3]
+
+// GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[4:9], a[0:3] cbsz:1 blgp:3 ; encoding: [0x00,0x81,0xad,0xd3,0x04,0x09,0x02,0x7c]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[4:9], a[0:3] cbsz:1 blgp:3
+
+//===----------------------------------------------------------------------===//
+// v_mfma_f32_32x32x64_f8f6f4
+//===----------------------------------------------------------------------===//
+
+// GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] ; encoding: [0x00,0x00,0xae,0xd3,0x04,0x09,0x02,0x04]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15]
+
+// GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] blgp:1 ; encoding: [0x00,0x00,0xae,0xd3,0x04,0x09,0x02,0x24]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] blgp:1
+
+// GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[4:11], v[0:15] cbsz:3 ; encoding: [0x00,0x03,0xae,0xd3,0x04,0x09,0x02,0x04]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[4:11], v[0:15] cbsz:3
+
+// GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] ; encoding: [0x00,0x00,0xae,0xd3,0x04,0x09,0x02,0x04]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15]
+
+// GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[4:11], v[0:15] cbsz:3 blgp:1 ; encoding: [0x00,0x03,0xae,0xd3,0x04,0x09,0x02,0x24]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[4:11], v[0:15] cbsz:3 blgp:1
+
+// GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:11], a[4:11], a[0:15] ; encoding: [0x00,0x80,0xae,0xd3,0x04,0x09,0x02,0x1c]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:11], a[4:11], a[0:15]
+
+// GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:11], a[4:11], a[0:15] blgp:1 ; encoding: [0x00,0x80,0xae,0xd3,0x04,0x09,0x02,0x3c]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:11], a[4:11], a[0:15] blgp:1
+
+// GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:9], a[4:11], a[0:15] cbsz:3 ; encoding: [0x00,0x83,0xae,0xd3,0x04,0x09,0x02,0x1c]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:9], a[4:11], a[0:15] cbsz:3
+
+// GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:11], a[4:11], a[0:15] ; encoding: [0x00,0x80,0xae,0xd3,0x04,0x09,0x02,0x1c]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:11], a[4:11], a[0:15]
+
+// GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:11], a[4:9], a[0:15] cbsz:1 blgp:3 ; encoding: [0x00,0x81,0xae,0xd3,0x04,0x09,0x02,0x7c]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:11], a[4:9], a[0:15] cbsz:1 blgp:3
+
+//===----------------------------------------------------------------------===//
+// v_mfma_scale_f32_16x16x128_f8f6f4
+//===----------------------------------------------------------------------===//
+// FIXME: Test op_sel, neg, clamp
+
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25
+
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:19], a[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x88,0xad,0xd3,0x04,0x19,0x52,0x1c]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:19], a[20:23], v24, v25
+
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[4:11], v[12:19], a[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x88,0xad,0xd3,0x04,0x19,0x52,0x04]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[4:11], v[12:19], a[20:23], v24, v25
+
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], a[4:11], a[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x1c]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], a[4:11], a[12:19], v[20:23], v24, v25
+
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], a[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x14]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], a[12:19], v[20:23], v24, v25
+
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], a[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x0c]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], a[4:11], v[12:19], v[20:23], v24, v25
+
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[50:53], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x32,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_scale_f32_16x16x128_f8f6f4 v[50:53], v[4:11], v[12:19], v[20:23], v24, v25
+
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v44, s24 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x2c,0x31,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v44, s24
+
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s24, s24 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x30,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s24, s24
+
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s24, v44 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x58,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s24, v44
+
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], m0, m0 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x7c,0xf8,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], m0, m0
+
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], vcc_lo, v2 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x6a,0x04,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], vcc_lo, v2
+
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 9, v2 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x89,0x04,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 9, v2
+
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v2, 9 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x02,0x13,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v2, 9
+
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s20, 9 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x14,0x12,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s20, 9
+
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 33, 9 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0xa1,0x12,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 33, 9
+
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 cbsz:3 blgp:1
+
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x08,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[0,1,0] cbsz:3 blgp:1
+
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[0,1,0] op_sel_hi:[0,1,0] cbsz:3 blgp:1
+
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:17], v[20:23], v24, v25 op_sel_hi:[0,0,0] blgp:2 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x44]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:17], v[20:23], v24, v25 op_sel:[0,1,0] op_sel_hi:[0,1,0] blgp:2
+
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 op_sel:[0,1,0] op_sel_hi:[0,1,0]
+
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x04]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[0,1,0] op_sel_hi:[0,1,0] cbsz:3
+
+//===----------------------------------------------------------------------===//
+// v_mfma_scale_f32_32x32x64_f8f6f4
+//===----------------------------------------------------------------------===//
+// FIXME: Test op_sel, neg, clamp
+
+// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49
+
+// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:23], a[24:31], a[32:47], v48, v49 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x88,0xae,0xd3,0x10,0x31,0x82,0x1c]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:23], a[24:31], a[32:47], v48, v49
+
+// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[24:31], a[32:47], v48, v49 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x88,0xae,0xd3,0x10,0x31,0x82,0x04]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[24:31], a[32:47], v48, v49
+
+// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], a[16:23], a[24:31], v[32:47], v48, v49 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x1c]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], a[16:23], a[24:31], v[32:47], v48, v49
+
+// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], a[24:31], v[32:47], v48, v49 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x14]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], a[24:31], v[32:47], v48, v49
+
+// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:23], v[24:31], a[32:47], v48, v49 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x88,0xae,0xd3,0x10,0x31,0x82,0x0c]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:23], v[24:31], a[32:47], v48, v49
+
+// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[50:65], v[16:23], v[24:31], v[32:47], v48, v49 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x32,0x08,0xae,0xd3,0x10,0x31,0x82,0x04]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_scale_f32_32x32x64_f8f6f4 v[50:65], v[16:23], v[24:31], v[32:47], v48, v49
+
+// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:31], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0b,0xae,0xd3,0x10,0x31,0x82,0x24]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:31], v[32:47], v48, v49 cbsz:3 blgp:1
+
+// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:31], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:2 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0a,0xae,0xd3,0x10,0x31,0x82,0x04]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:31], v[32:47], v48, v49 cbsz:2
+
+// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49
+
+// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:29], v[32:47], v48, v49 op_sel_hi:[0,0,0] blgp:2 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x44]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:29], v[32:47], v48, v49 blgp:2
+
+// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:29], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:2 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x18,0x00,0x0a,0xae,0xd3,0x10,0x31,0x82,0x64]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:29], v[32:47], v48, v49 op_sel:[0,1,0] op_sel_hi:[0,1,0] cbsz:2 blgp:3
+
+//===----------------------------------------------------------------------===//
+// v_mfma_f32_16x16x128_f8f6f4 with appropriate register widths
+//===----------------------------------------------------------------------===//
+
+// bf8 x fp4
+// GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:15], v[20:23] cbsz:1 blgp:4 ; encoding: [0x00,0x01,0xad,0xd3,0x04,0x19,0x52,0x84]
+v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:15], v[20:23] cbsz:1 blgp:4
+
+// GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:15], v[20:23] cbsz:1 blgp:4 ; encoding: [0x00,0x01,0xad,0xd3,0x04,0x19,0x52,0x84]
+v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:15], v[20:23] cbsz:1 blgp:4
+
+// GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:15], a[20:23] cbsz:1 blgp:4 ; encoding: [0x00,0x81,0xad,0xd3,0x04,0x19,0x52,0x9c]
+v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:15], a[20:23] cbsz:1 blgp:4
+
+
+// fp4 x bf8
+// GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:19], v[20:23] cbsz:4 blgp:1 ; encoding: [0x00,0x04,0xad,0xd3,0x04,0x19,0x52,0x24]
+v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:19], v[20:23] cbsz:4 blgp:1
+
+// GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:7], a[12:19], a[20:23] cbsz:4 blgp:1 ; encoding: [0x00,0x84,0xad,0xd3,0x04,0x19,0x52,0x3c]
+v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:7], a[12:19], a[20:23] cbsz:4 blgp:1
+
+
+// bf6 x bf8
+// GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23] cbsz:3 blgp:1 ; encoding: [0x00,0x03,0xad,0xd3,0x04,0x19,0x52,0x24]
+v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23] cbsz:3 blgp:1
+
+// GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:19], a[20:23] cbsz:3 blgp:1 ; encoding: [0x00,0x83,0xad,0xd3,0x04,0x19,0x52,0x3c]
+v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:19], a[20:23] cbsz:3 blgp:1
+
+
+// bf8 x bf6
+// GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:17], v[20:23] cbsz:1 blgp:3 ; encoding: [0x00,0x01,0xad,0xd3,0x04,0x19,0x52,0x64]
+v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:17], v[20:23] cbsz:1 blgp:3
+
+// GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:17], a[20:23] cbsz:1 blgp:3 ; encoding: [0x00,0x81,0xad,0xd3,0x04,0x19,0x52,0x7c]
+v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:17], a[20:23] cbsz:1 blgp:3
+
+
+// bf6 x bf6
+// GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:17], v[20:23] cbsz:3 blgp:3 ; encoding: [0x00,0x03,0xad,0xd3,0x04,0x19,0x52,0x64]
+v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:17], v[20:23] cbsz:3 blgp:3
+
+// GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:17], a[20:23] cbsz:3 blgp:3 ; encoding: [0x00,0x83,0xad,0xd3,0x04,0x19,0x52,0x7c]
+v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:17], a[20:23] cbsz:3 blgp:3
+
+// bf6 x fp6
+// GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:17], v[20:23] cbsz:3 blgp:2 ; encoding: [0x00,0x03,0xad,0xd3,0x04,0x19,0x52,0x44]
+v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:17], v[20:23] cbsz:3 blgp:2
+
+// GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:17], a[20:23] cbsz:3 blgp:2 ; encoding: [0x00,0x83,0xad,0xd3,0x04,0x19,0x52,0x5c]
+v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:17], a[20:23] cbsz:3 blgp:2
+
+
+// fp6 x bf6
+// GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:17], v[20:23] cbsz:2 blgp:3 ; encoding: [0x00,0x02,0xad,0xd3,0x04,0x19,0x52,0x64]
+v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:17], v[20:23] cbsz:2 blgp:3
+
+// GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:17], a[20:23] cbsz:2 blgp:3 ; encoding: [0x00,0x82,0xad,0xd3,0x04,0x19,0x52,0x7c]
+v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:17], a[20:23] cbsz:2 blgp:3
+
+
+// fp6 x fp4
+// GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:15], v[20:23] cbsz:2 blgp:4 ; encoding: [0x00,0x02,0xad,0xd3,0x04,0x19,0x52,0x84]
+v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:15], v[20:23] cbsz:2 blgp:4
+
+// GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:15], a[20:23] cbsz:2 blgp:4 ; encoding: [0x00,0x82,0xad,0xd3,0x04,0x19,0x52,0x9c]
+v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:15], a[20:23] cbsz:2 blgp:4
+
+
+// fp4 x fp6
+// GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:17], v[20:23] cbsz:4 blgp:2 ; encoding: [0x00,0x04,0xad,0xd3,0x04,0x19,0x52,0x44]
+v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:17], v[20:23] cbsz:4 blgp:2
+
+// GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:7], a[12:17], a[20:23] cbsz:4 blgp:2 ; encoding: [0x00,0x84,0xad,0xd3,0x04,0x19,0x52,0x5c]
+v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:7], a[12:17], a[20:23] cbsz:4 blgp:2
+
+
+// fp4 x fp4
+// GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:15], v[20:23] cbsz:4 blgp:4 ; encoding: [0x00,0x04,0xad,0xd3,0x04,0x19,0x52,0x84]
+v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:15], v[20:23] cbsz:4 blgp:4
+
+// GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:7], a[12:15], a[20:23] cbsz:4 blgp:4 ; encoding: [0x00,0x84,0xad,0xd3,0x04,0x19,0x52,0x9c]
+v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:7], a[12:15], a[20:23] cbsz:4 blgp:4
+
+//===----------------------------------------------------------------------===//
+// v_mfma_f32_32x32x64_f8f6f4 with appropriate register widths
+//===----------------------------------------------------------------------===//
+
+// bf8 x fp4
+// GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[12:15], v[16:31] cbsz:1 blgp:4 ; encoding: [0x00,0x01,0xae,0xd3,0x04,0x19,0x42,0x84]
+v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[12:15], v[16:31] cbsz:1 blgp:4
+
+// GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:11], a[12:15], a[16:31] cbsz:1 blgp:4 ; encoding: [0x00,0x81,0xae,0xd3,0x04,0x19,0x42,0x9c]
+v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:11], a[12:15], a[16:31] cbsz:1 blgp:4
+
+
+// fp4 x bf8
+// GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:7], v[8:15], v[16:31] cbsz:4 blgp:1 ; encoding: [0x00,0x04,0xae,0xd3,0x04,0x11,0x42,0x24]
+v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:7], v[8:15], v[16:31] cbsz:4 blgp:1
+
+// GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:7], a[8:15], a[16:31] cbsz:4 blgp:1 ; encoding: [0x00,0x84,0xae,0xd3,0x04,0x11,0x42,0x3c]
+v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:7], a[8:15], a[16:31] cbsz:4 blgp:1
+
+
+// bf6 x bf8
+// GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[12:19], v[16:31] cbsz:3 blgp:1 ; encoding: [0x00,0x03,0xae,0xd3,0x04,0x19,0x42,0x24]
+v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[12:19], v[16:31] cbsz:3 blgp:1
+
+// GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:9], a[12:19], a[16:31] cbsz:3 blgp:1 ; encoding: [0x00,0x83,0xae,0xd3,0x04,0x19,0x42,0x3c]
+v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:9], a[12:19], a[16:31] cbsz:3 blgp:1
+
+// bf8 x bf6
+// GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[12:17], v[16:31] cbsz:1 blgp:3 ; encoding: [0x00,0x01,0xae,0xd3,0x04,0x19,0x42,0x64]
+v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[12:17], v[16:31] cbsz:1 blgp:3
+
+// GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:11], a[12:17], a[16:31] cbsz:1 blgp:3 ; encoding: [0x00,0x81,0xae,0xd3,0x04,0x19,0x42,0x7c]
+v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:11], a[12:17], a[16:31] cbsz:1 blgp:3
+
+
+// bf6 x bf6
+// GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[12:17], v[16:31] cbsz:3 blgp:3 ; encoding: [0x00,0x03,0xae,0xd3,0x04,0x19,0x42,0x64]
+v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[12:17], v[16:31] cbsz:3 blgp:3
+
+// GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:9], a[12:17], a[16:31] cbsz:3 blgp:3 ; encoding: [0x00,0x83,0xae,0xd3,0x04,0x19,0x42,0x7c]
+v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:9], a[12:17], a[16:31] cbsz:3 blgp:3
+
+
+// bf6 x fp6
+// GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[12:17], v[16:31] cbsz:3 blgp:2 ; encoding: [0x00,0x03,0xae,0xd3,0x04,0x19,0x42,0x44]
+v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[12:17], v[16:31] cbsz:3 blgp:2
+
+// GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:9], a[12:17], a[16:31] cbsz:3 blgp:2 ; encoding: [0x00,0x83,0xae,0xd3,0x04,0x19,0x42,0x5c]
+v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:9], a[12:17], a[16:31] cbsz:3 blgp:2
+
+
+// fp6 x bf6
+// GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[12:17], v[16:31] cbsz:2 blgp:3 ; encoding: [0x00,0x02,0xae,0xd3,0x04,0x19,0x42,0x64]
+v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[12:17], v[16:31] cbsz:2 blgp:3
+
+// GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:9], a[12:17], a[16:31] cbsz:2 blgp:3 ; encoding: [0x00,0x82,0xae,0xd3,0x04,0x19,0x42,0x7c]
+v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:9], a[12:17], a[16:31] cbsz:2 blgp:3
+
+
+// fp6 x fp4
+// GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[12:15], v[16:31] cbsz:2 blgp:4 ; encoding: [0x00,0x02,0xae,0xd3,0x04,0x19,0x42,0x84]
+v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[12:15], v[16:31] cbsz:2 blgp:4
+
+// GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:9], a[12:15], a[16:31] cbsz:2 blgp:4 ; encoding: [0x00,0x82,0xae,0xd3,0x04,0x19,0x42,0x9c]
+v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:9], a[12:15], a[16:31] cbsz:2 blgp:4
+
+
+// fp4 x fp6
+// GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:7], v[12:17], v[16:31] cbsz:4 blgp:2 ; encoding: [0x00,0x04,0xae,0xd3,0x04,0x19,0x42,0x44]
+v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:7], v[12:17], v[16:31] cbsz:4 blgp:2
+
+// GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:7], a[12:17], a[16:31] cbsz:4 blgp:2 ; encoding: [0x00,0x84,0xae,0xd3,0x04,0x19,0x42,0x5c]
+v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:7], a[12:17], a[16:31] cbsz:4 blgp:2
+
+
+// fp4 x fp4
+// GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:7], v[12:15], v[16:31] cbsz:4 blgp:4 ; encoding: [0x00,0x04,0xae,0xd3,0x04,0x19,0x42,0x84]
+v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:7], v[12:15], v[16:31] cbsz:4 blgp:4
+
+// GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:7], a[12:15], a[16:31] cbsz:4 blgp:4 ; encoding: [0x00,0x84,0xae,0xd3,0x04,0x19,0x42,0x9c]
+v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:7], a[12:15], a[16:31] cbsz:4 blgp:4
+
+//===----------------------------------------------------------------------===//
+// v_mfma_scale_f32_16x16x128_f8f6f4 corrected widths
+//===----------------------------------------------------------------------===//
+
+// fp8 x fp8
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 cbsz:0 blgp:0
+
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:19], a[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x88,0xad,0xd3,0x04,0x19,0x52,0x1c]
+v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:19], a[20:23], v24, v25 cbsz:0 blgp:0
+
+
+// bf8 x fp8
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x09,0xad,0xd3,0x04,0x19,0x52,0x04]
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 cbsz:1 blgp:0
+
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:19], a[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x89,0xad,0xd3,0x04,0x19,0x52,0x1c]
+v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:19], a[20:23], v24, v25 cbsz:1 blgp:0
+
+// fp8 x bf8
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x24]
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 cbsz:0 blgp:1
+
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:19], a[20:23], v24, v25 op_sel_hi:[0,0,0] blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x88,0xad,0xd3,0x04,0x19,0x52,0x3c]
+v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:19], a[20:23], v24, v25 cbsz:0 blgp:1
+
+
+// bf8 x fp4
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:15], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:1 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x09,0xad,0xd3,0x04,0x19,0x52,0x84]
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:15], v[20:23], v24, v25 cbsz:1 blgp:4
+
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:15], a[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:1 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x89,0xad,0xd3,0x04,0x19,0x52,0x9c]
+v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:15], a[20:23], v24, v25 cbsz:1 blgp:4
+
+// fp4 x bf8
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0c,0xad,0xd3,0x04,0x19,0x52,0x24]
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:19], v[20:23], v24, v25 cbsz:4 blgp:1
+
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:7], a[12:19], a[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8c,0xad,0xd3,0x04,0x19,0x52,0x3c]
+v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:7], a[12:19], a[20:23], v24, v25 cbsz:4 blgp:1
+
+
+// bf6 x bf8
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24]
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 cbsz:3 blgp:1
+
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:19], a[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8b,0xad,0xd3,0x04,0x19,0x52,0x3c]
+v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:19], a[20:23], v24, v25 cbsz:3 blgp:1
+
+
+// bf8 x bf6
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:17], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:1 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x09,0xad,0xd3,0x04,0x19,0x52,0x64]
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:17], v[20:23], v24, v25 cbsz:1 blgp:3
+
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:17], a[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:1 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x89,0xad,0xd3,0x04,0x19,0x52,0x7c]
+v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:17], a[20:23], v24, v25 cbsz:1 blgp:3
+
+
+// bf6 x bf6
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:17], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x64]
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:17], v[20:23], v24, v25 cbsz:3 blgp:3
+
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:17], a[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8b,0xad,0xd3,0x04,0x19,0x52,0x7c]
+v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:17], a[20:23], v24, v25 cbsz:3 blgp:3
+
+
+// bf6 x fp6
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:17], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:2 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x44]
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:17], v[20:23], v24, v25 cbsz:3 blgp:2
+
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:17], a[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:2 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8b,0xad,0xd3,0x04,0x19,0x52,0x5c]
+v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:17], a[20:23], v24, v25 cbsz:3 blgp:2
+
+
+// fp6 x bf6
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:17], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:2 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0a,0xad,0xd3,0x04,0x19,0x52,0x64]
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:17], v[20:23], v24, v25 cbsz:2 blgp:3
+
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:17], a[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:2 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8a,0xad,0xd3,0x04,0x19,0x52,0x7c]
+v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:17], a[20:23], v24, v25 cbsz:2 blgp:3
+
+
+// fp6 x fp4
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:15], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:2 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0a,0xad,0xd3,0x04,0x19,0x52,0x84]
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:15], v[20:23], v24, v25 cbsz:2 blgp:4
+
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:15], a[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:2 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8a,0xad,0xd3,0x04,0x19,0x52,0x9c]
+v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:15], a[20:23], v24, v25 cbsz:2 blgp:4
+
+// fp4 x fp6
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:17], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:2 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0c,0xad,0xd3,0x04,0x19,0x52,0x44]
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:17], v[20:23], v24, v25 cbsz:4 blgp:2
+
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:7], a[12:17], a[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:2 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8c,0xad,0xd3,0x04,0x19,0x52,0x5c]
+v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:7], a[12:17], a[20:23], v24, v25 cbsz:4 blgp:2
+
+// fp4 x fp4
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:15], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0c,0xad,0xd3,0x04,0x19,0x52,0x84]
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:15], v[20:23], v24, v25 cbsz:4 blgp:4
+
+// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:7], a[12:15], a[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8c,0xad,0xd3,0x04,0x19,0x52,0x9c]
+v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:7], a[12:15], a[20:23], v24, v25 cbsz:4 blgp:4
+
+
+//===----------------------------------------------------------------------===//
+// v_mfma_scale_f32_32x32x64_f8f6f4 corrected widths
+//===----------------------------------------------------------------------===//
+
+// fp8 x fp8
+// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04]
+v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 cbsz:0 blgp:0
+
+// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:23], a[24:31], a[32:47], v48, v49 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x88,0xae,0xd3,0x10,0x31,0x82,0x1c]
+v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:23], a[24:31], a[32:47], v48, v49 cbsz:0 blgp:0
+
+// bf8 x fp8
+// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:1 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x09,0xae,0xd3,0x10,0x31,0x82,0x04]
+v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 cbsz:1 blgp:0
+
+// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:23], a[24:31], a[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:1 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x89,0xae,0xd3,0x10,0x31,0x82,0x1c]
+v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:23], a[24:31], a[32:47], v48, v49 cbsz:1 blgp:0
+
+
+// fp8 x bf8
+// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel_hi:[0,0,0] blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x24]
+v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 cbsz:0 blgp:1
+
+// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:23], a[24:31], a[32:47], v48, v49 op_sel_hi:[0,0,0] blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x88,0xae,0xd3,0x10,0x31,0x82,0x3c]
+v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:23], a[24:31], a[32:47], v48, v49 cbsz:0 blgp:1
+
+// bf8 x fp4
+// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:27], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:1 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x09,0xae,0xd3,0x10,0x31,0x82,0x84]
+v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:27], v[32:47], v48, v49 cbsz:1 blgp:4
+
+// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:23], a[24:27], a[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:1 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x89,0xae,0xd3,0x10,0x31,0x82,0x9c]
+v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:23], a[24:27], a[32:47], v48, v49 cbsz:1 blgp:4
+
+
+// fp4 x bf8
+// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:19], v[24:31], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:4 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0c,0xae,0xd3,0x10,0x31,0x82,0x24]
+v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:19], v[24:31], v[32:47], v48, v49 cbsz:4 blgp:1
+
+// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:19], a[24:31], a[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:4 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8c,0xae,0xd3,0x10,0x31,0x82,0x3c]
+v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:19], a[24:31], a[32:47], v48, v49 cbsz:4 blgp:1
+
+
+// bf6 x bf8
+// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:31], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0b,0xae,0xd3,0x10,0x31,0x82,0x24]
+v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:31], v[32:47], v48, v49 cbsz:3 blgp:1
+
+// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:21], a[24:31], a[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8b,0xae,0xd3,0x10,0x31,0x82,0x3c]
+v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:21], a[24:31], a[32:47], v48, v49 cbsz:3 blgp:1
+
+
+// bf8 x bf6
+// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:29], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:1 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x09,0xae,0xd3,0x10,0x31,0x82,0x64]
+v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:29], v[32:47], v48, v49 cbsz:1 blgp:3
+
+// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:23], a[24:29], a[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:1 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x89,0xae,0xd3,0x10,0x31,0x82,0x7c]
+v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:23], a[24:29], a[32:47], v48, v49 cbsz:1 blgp:3
+
+// bf6 x bf6
+// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:29], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:3 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0b,0xae,0xd3,0x10,0x31,0x82,0x64]
+v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:29], v[32:47], v48, v49 cbsz:3 blgp:3
+
+// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:21], a[24:29], a[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:3 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8b,0xae,0xd3,0x10,0x31,0x82,0x7c]
+v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:21], a[24:29], a[32:47], v48, v49 cbsz:3 blgp:3
+
+// bf6 x fp6
+// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:29], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:3 blgp:2 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0b,0xae,0xd3,0x10,0x31,0x82,0x44]
+v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:29], v[32:47], v48, v49 cbsz:3 blgp:2
+
+// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:21], a[24:29], a[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:3 blgp:2 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8b,0xae,0xd3,0x10,0x31,0x82,0x5c]
+v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:21], a[24:29], a[32:47], v48, v49 cbsz:3 blgp:2
+
+
+// fp6 x bf6
+// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:29], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:2 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0a,0xae,0xd3,0x10,0x31,0x82,0x64]
+v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:29], v[32:47], v48, v49 cbsz:2 blgp:3
+
+// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:21], a[24:29], a[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:2 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8a,0xae,0xd3,0x10,0x31,0x82,0x7c]
+v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:21], a[24:29], a[32:47], v48, v49 cbsz:2 blgp:3
+
+
+// fp6 x fp4
+// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:27], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:2 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0a,0xae,0xd3,0x10,0x31,0x82,0x84]
+v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:27], v[32:47], v48, v49 cbsz:2 blgp:4
+
+// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:21], a[24:27], a[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:2 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8a,0xae,0xd3,0x10,0x31,0x82,0x9c]
+v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:21], a[24:27], a[32:47], v48, v49 cbsz:2 blgp:4
+
+
+// fp4 x fp6
+// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:19], v[24:29], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:4 blgp:2 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0c,0xae,0xd3,0x10,0x31,0x82,0x44]
+v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:19], v[24:29], v[32:47], v48, v49 cbsz:4 blgp:2
+
+// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:19], a[24:29], a[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:4 blgp:2 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8c,0xae,0xd3,0x10,0x31,0x82,0x5c]
+v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:19], a[24:29], a[32:47], v48, v49 cbsz:4 blgp:2
+
+
+// fp4 x fp4
+// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:19], v[24:27], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:4 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0c,0xae,0xd3,0x10,0x31,0x82,0x84]
+v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:19], v[24:27], v[32:47], v48, v49 cbsz:4 blgp:4
+
+// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:19], a[24:27], a[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:4 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8c,0xae,0xd3,0x10,0x31,0x82,0x9c]
+v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:19], a[24:27], a[32:47], v48, v49 cbsz:4 blgp:4
+
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt
index 1fa48fca80fb45..b35789cbf500f6 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt
@@ -159,3 +159,411 @@
# GFX950: v_mfma_ld_scale_b32 vcc_lo, vcc_lo ; encoding: [0x00,0x40,0xac,0xd3,0x6a,0xd4,0x00,0x18]
0x00,0x40,0xac,0xd3,0x6a,0xd4,0x00,0x18
+
+# GFX950: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
+0x00,0x00,0x80,0xbf
+
+# GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:15], a[20:23] cbsz:1 blgp:4 ; encoding: [0x00,0x81,0xad,0xd3,0x04,0x19,0x52,0x9c]
+0x00,0x81,0xad,0xd3,0x04,0x19,0x52,0x9c
+
+# GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:17], a[20:23] cbsz:1 blgp:3 ; encoding: [0x00,0x81,0xad,0xd3,0x04,0x19,0x52,0x7c]
+0x00,0x81,0xad,0xd3,0x04,0x19,0x52,0x7c
+
+# GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[4:11], a[0:3] ; encoding: [0x00,0x80,0xad,0xd3,0x04,0x09,0x02,0x1c]
+0x00,0x80,0xad,0xd3,0x04,0x09,0x02,0x1c
+
+# GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[4:11], a[0:3] blgp:1 ; encoding: [0x00,0x80,0xad,0xd3,0x04,0x09,0x02,0x3c]
+0x00,0x80,0xad,0xd3,0x04,0x09,0x02,0x3c
+
+# GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[4:9], a[0:3] cbsz:1 blgp:3 ; encoding: [0x00,0x81,0xad,0xd3,0x04,0x09,0x02,0x7c]
+0x00,0x81,0xad,0xd3,0x04,0x09,0x02,0x7c
+
+# GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:7], a[12:15], a[20:23] cbsz:4 blgp:4 ; encoding: [0x00,0x84,0xad,0xd3,0x04,0x19,0x52,0x9c]
+0x00,0x84,0xad,0xd3,0x04,0x19,0x52,0x9c
+
+# GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:7], a[12:17], a[20:23] cbsz:4 blgp:2 ; encoding: [0x00,0x84,0xad,0xd3,0x04,0x19,0x52,0x5c]
+0x00,0x84,0xad,0xd3,0x04,0x19,0x52,0x5c
+
+# GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:7], a[12:19], a[20:23] cbsz:4 blgp:1 ; encoding: [0x00,0x84,0xad,0xd3,0x04,0x19,0x52,0x3c]
+0x00,0x84,0xad,0xd3,0x04,0x19,0x52,0x3c
+
+# GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:15], a[20:23] cbsz:2 blgp:4 ; encoding: [0x00,0x82,0xad,0xd3,0x04,0x19,0x52,0x9c]
+0x00,0x82,0xad,0xd3,0x04,0x19,0x52,0x9c
+
+# GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:17], a[20:23] cbsz:2 blgp:3 ; encoding: [0x00,0x82,0xad,0xd3,0x04,0x19,0x52,0x7c]
+0x00,0x82,0xad,0xd3,0x04,0x19,0x52,0x7c
+
+# GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:17], a[20:23] cbsz:3 blgp:2 ; encoding: [0x00,0x83,0xad,0xd3,0x04,0x19,0x52,0x5c]
+0x00,0x83,0xad,0xd3,0x04,0x19,0x52,0x5c
+
+# GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:17], a[20:23] cbsz:3 blgp:3 ; encoding: [0x00,0x83,0xad,0xd3,0x04,0x19,0x52,0x7c]
+0x00,0x83,0xad,0xd3,0x04,0x19,0x52,0x7c
+
+# GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:19], a[20:23] cbsz:3 blgp:1 ; encoding: [0x00,0x83,0xad,0xd3,0x04,0x19,0x52,0x3c]
+0x00,0x83,0xad,0xd3,0x04,0x19,0x52,0x3c
+
+# GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[4:11], a[0:3] cbsz:3 ; encoding: [0x00,0x83,0xad,0xd3,0x04,0x09,0x02,0x1c]
+0x00,0x83,0xad,0xd3,0x04,0x09,0x02,0x1c
+
+# GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:15], v[20:23] cbsz:1 blgp:4 ; encoding: [0x00,0x01,0xad,0xd3,0x04,0x19,0x52,0x84]
+0x00,0x01,0xad,0xd3,0x04,0x19,0x52,0x84
+
+# GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:17], v[20:23] cbsz:1 blgp:3 ; encoding: [0x00,0x01,0xad,0xd3,0x04,0x19,0x52,0x64]
+0x00,0x01,0xad,0xd3,0x04,0x19,0x52,0x64
+
+# GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] ; encoding: [0x00,0x00,0xad,0xd3,0x04,0x09,0x02,0x04]
+0x00,0x00,0xad,0xd3,0x04,0x09,0x02,0x04
+
+# GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] blgp:1 ; encoding: [0x00,0x00,0xad,0xd3,0x04,0x09,0x02,0x24]
+0x00,0x00,0xad,0xd3,0x04,0x09,0x02,0x24
+
+# GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:15], v[20:23] cbsz:4 blgp:4 ; encoding: [0x00,0x04,0xad,0xd3,0x04,0x19,0x52,0x84]
+0x00,0x04,0xad,0xd3,0x04,0x19,0x52,0x84
+
+# GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:17], v[20:23] cbsz:4 blgp:2 ; encoding: [0x00,0x04,0xad,0xd3,0x04,0x19,0x52,0x44]
+0x00,0x04,0xad,0xd3,0x04,0x19,0x52,0x44
+
+# GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:19], v[20:23] cbsz:4 blgp:1 ; encoding: [0x00,0x04,0xad,0xd3,0x04,0x19,0x52,0x24]
+0x00,0x04,0xad,0xd3,0x04,0x19,0x52,0x24
+
+# GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:15], v[20:23] cbsz:2 blgp:4 ; encoding: [0x00,0x02,0xad,0xd3,0x04,0x19,0x52,0x84]
+0x00,0x02,0xad,0xd3,0x04,0x19,0x52,0x84
+
+# GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:17], v[20:23] cbsz:2 blgp:3 ; encoding: [0x00,0x02,0xad,0xd3,0x04,0x19,0x52,0x64]
+0x00,0x02,0xad,0xd3,0x04,0x19,0x52,0x64
+
+# GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:17], v[20:23] cbsz:3 blgp:2 ; encoding: [0x00,0x03,0xad,0xd3,0x04,0x19,0x52,0x44]
+0x00,0x03,0xad,0xd3,0x04,0x19,0x52,0x44
+
+# GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:17], v[20:23] cbsz:3 blgp:3 ; encoding: [0x00,0x03,0xad,0xd3,0x04,0x19,0x52,0x64]
+0x00,0x03,0xad,0xd3,0x04,0x19,0x52,0x64
+
+# GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23] cbsz:3 blgp:1 ; encoding: [0x00,0x03,0xad,0xd3,0x04,0x19,0x52,0x24]
+0x00,0x03,0xad,0xd3,0x04,0x19,0x52,0x24
+
+# GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[4:11], v[0:3] cbsz:3 ; encoding: [0x00,0x03,0xad,0xd3,0x04,0x09,0x02,0x04]
+0x00,0x03,0xad,0xd3,0x04,0x09,0x02,0x04
+
+# GFX950: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[4:11], v[0:3] cbsz:3 blgp:1 ; encoding: [0x00,0x03,0xad,0xd3,0x04,0x09,0x02,0x24]
+0x00,0x03,0xad,0xd3,0x04,0x09,0x02,0x24
+
+# GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:11], a[12:15], a[16:31] cbsz:1 blgp:4 ; encoding: [0x00,0x81,0xae,0xd3,0x04,0x19,0x42,0x9c]
+0x00,0x81,0xae,0xd3,0x04,0x19,0x42,0x9c
+
+# GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:11], a[12:17], a[16:31] cbsz:1 blgp:3 ; encoding: [0x00,0x81,0xae,0xd3,0x04,0x19,0x42,0x7c]
+0x00,0x81,0xae,0xd3,0x04,0x19,0x42,0x7c
+
+# GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:11], a[4:11], a[0:15] ; encoding: [0x00,0x80,0xae,0xd3,0x04,0x09,0x02,0x1c]
+0x00,0x80,0xae,0xd3,0x04,0x09,0x02,0x1c
+
+# GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:11], a[4:11], a[0:15] blgp:1 ; encoding: [0x00,0x80,0xae,0xd3,0x04,0x09,0x02,0x3c]
+0x00,0x80,0xae,0xd3,0x04,0x09,0x02,0x3c
+
+# GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:11], a[4:9], a[0:15] cbsz:1 blgp:3 ; encoding: [0x00,0x81,0xae,0xd3,0x04,0x09,0x02,0x7c]
+0x00,0x81,0xae,0xd3,0x04,0x09,0x02,0x7c
+
+# GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:7], a[12:15], a[16:31] cbsz:4 blgp:4 ; encoding: [0x00,0x84,0xae,0xd3,0x04,0x19,0x42,0x9c]
+0x00,0x84,0xae,0xd3,0x04,0x19,0x42,0x9c
+
+# GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:7], a[12:17], a[16:31] cbsz:4 blgp:2 ; encoding: [0x00,0x84,0xae,0xd3,0x04,0x19,0x42,0x5c]
+0x00,0x84,0xae,0xd3,0x04,0x19,0x42,0x5c
+
+# GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:7], a[8:15], a[16:31] cbsz:4 blgp:1 ; encoding: [0x00,0x84,0xae,0xd3,0x04,0x11,0x42,0x3c]
+0x00,0x84,0xae,0xd3,0x04,0x11,0x42,0x3c
+
+# GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:9], a[12:15], a[16:31] cbsz:2 blgp:4 ; encoding: [0x00,0x82,0xae,0xd3,0x04,0x19,0x42,0x9c]
+0x00,0x82,0xae,0xd3,0x04,0x19,0x42,0x9c
+
+# GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:9], a[12:17], a[16:31] cbsz:2 blgp:3 ; encoding: [0x00,0x82,0xae,0xd3,0x04,0x19,0x42,0x7c]
+0x00,0x82,0xae,0xd3,0x04,0x19,0x42,0x7c
+
+# GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:9], a[12:17], a[16:31] cbsz:3 blgp:2 ; encoding: [0x00,0x83,0xae,0xd3,0x04,0x19,0x42,0x5c]
+0x00,0x83,0xae,0xd3,0x04,0x19,0x42,0x5c
+
+# GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:9], a[12:17], a[16:31] cbsz:3 blgp:3 ; encoding: [0x00,0x83,0xae,0xd3,0x04,0x19,0x42,0x7c]
+0x00,0x83,0xae,0xd3,0x04,0x19,0x42,0x7c
+
+# GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:9], a[12:19], a[16:31] cbsz:3 blgp:1 ; encoding: [0x00,0x83,0xae,0xd3,0x04,0x19,0x42,0x3c]
+0x00,0x83,0xae,0xd3,0x04,0x19,0x42,0x3c
+
+# GFX950: v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:9], a[4:11], a[0:15] cbsz:3 ; encoding: [0x00,0x83,0xae,0xd3,0x04,0x09,0x02,0x1c]
+0x00,0x83,0xae,0xd3,0x04,0x09,0x02,0x1c
+
+# GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[12:15], v[16:31] cbsz:1 blgp:4 ; encoding: [0x00,0x01,0xae,0xd3,0x04,0x19,0x42,0x84]
+0x00,0x01,0xae,0xd3,0x04,0x19,0x42,0x84
+
+# GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[12:17], v[16:31] cbsz:1 blgp:3 ; encoding: [0x00,0x01,0xae,0xd3,0x04,0x19,0x42,0x64]
+0x00,0x01,0xae,0xd3,0x04,0x19,0x42,0x64
+
+# GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] ; encoding: [0x00,0x00,0xae,0xd3,0x04,0x09,0x02,0x04]
+0x00,0x00,0xae,0xd3,0x04,0x09,0x02,0x04
+
+# GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] blgp:1 ; encoding: [0x00,0x00,0xae,0xd3,0x04,0x09,0x02,0x24]
+0x00,0x00,0xae,0xd3,0x04,0x09,0x02,0x24
+
+# GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:7], v[12:15], v[16:31] cbsz:4 blgp:4 ; encoding: [0x00,0x04,0xae,0xd3,0x04,0x19,0x42,0x84]
+0x00,0x04,0xae,0xd3,0x04,0x19,0x42,0x84
+
+# GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:7], v[12:17], v[16:31] cbsz:4 blgp:2 ; encoding: [0x00,0x04,0xae,0xd3,0x04,0x19,0x42,0x44]
+0x00,0x04,0xae,0xd3,0x04,0x19,0x42,0x44
+
+# GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:7], v[8:15], v[16:31] cbsz:4 blgp:1 ; encoding: [0x00,0x04,0xae,0xd3,0x04,0x11,0x42,0x24]
+0x00,0x04,0xae,0xd3,0x04,0x11,0x42,0x24
+
+# GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[12:15], v[16:31] cbsz:2 blgp:4 ; encoding: [0x00,0x02,0xae,0xd3,0x04,0x19,0x42,0x84]
+0x00,0x02,0xae,0xd3,0x04,0x19,0x42,0x84
+
+# GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[12:17], v[16:31] cbsz:2 blgp:3 ; encoding: [0x00,0x02,0xae,0xd3,0x04,0x19,0x42,0x64]
+0x00,0x02,0xae,0xd3,0x04,0x19,0x42,0x64
+
+# GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[12:17], v[16:31] cbsz:3 blgp:2 ; encoding: [0x00,0x03,0xae,0xd3,0x04,0x19,0x42,0x44]
+0x00,0x03,0xae,0xd3,0x04,0x19,0x42,0x44
+
+# GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[12:17], v[16:31] cbsz:3 blgp:3 ; encoding: [0x00,0x03,0xae,0xd3,0x04,0x19,0x42,0x64]
+0x00,0x03,0xae,0xd3,0x04,0x19,0x42,0x64
+
+# GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[12:19], v[16:31] cbsz:3 blgp:1 ; encoding: [0x00,0x03,0xae,0xd3,0x04,0x19,0x42,0x24]
+0x00,0x03,0xae,0xd3,0x04,0x19,0x42,0x24
+
+# GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[4:11], v[0:15] cbsz:3 ; encoding: [0x00,0x03,0xae,0xd3,0x04,0x09,0x02,0x04]
+0x00,0x03,0xae,0xd3,0x04,0x09,0x02,0x04
+
+# GFX950: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[4:11], v[0:15] cbsz:3 blgp:1 ; encoding: [0x00,0x03,0xae,0xd3,0x04,0x09,0x02,0x24]
+0x00,0x03,0xae,0xd3,0x04,0x09,0x02,0x24
+
+# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:15], a[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:1 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x89,0xad,0xd3,0x04,0x19,0x52,0x9c]
+0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x89,0xad,0xd3,0x04,0x19,0x52,0x9c
+
+# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:17], a[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:1 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x89,0xad,0xd3,0x04,0x19,0x52,0x7c]
+0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x89,0xad,0xd3,0x04,0x19,0x52,0x7c
+
+# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:19], a[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x88,0xad,0xd3,0x04,0x19,0x52,0x1c]
+0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x88,0xad,0xd3,0x04,0x19,0x52,0x1c
+
+# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:19], a[20:23], v24, v25 op_sel_hi:[0,0,0] blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x88,0xad,0xd3,0x04,0x19,0x52,0x3c]
+0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x88,0xad,0xd3,0x04,0x19,0x52,0x3c
+
+# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:19], a[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x89,0xad,0xd3,0x04,0x19,0x52,0x1c]
+0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x89,0xad,0xd3,0x04,0x19,0x52,0x1c
+
+# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:7], a[12:15], a[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8c,0xad,0xd3,0x04,0x19,0x52,0x9c]
+0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8c,0xad,0xd3,0x04,0x19,0x52,0x9c
+
+# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:7], a[12:17], a[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:2 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8c,0xad,0xd3,0x04,0x19,0x52,0x5c]
+0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8c,0xad,0xd3,0x04,0x19,0x52,0x5c
+
+# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:7], a[12:19], a[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8c,0xad,0xd3,0x04,0x19,0x52,0x3c]
+0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8c,0xad,0xd3,0x04,0x19,0x52,0x3c
+
+# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:15], a[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:2 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8a,0xad,0xd3,0x04,0x19,0x52,0x9c]
+0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8a,0xad,0xd3,0x04,0x19,0x52,0x9c
+
+# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:17], a[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:2 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8a,0xad,0xd3,0x04,0x19,0x52,0x7c]
+0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8a,0xad,0xd3,0x04,0x19,0x52,0x7c
+
+# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:17], a[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:2 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8b,0xad,0xd3,0x04,0x19,0x52,0x5c]
+0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8b,0xad,0xd3,0x04,0x19,0x52,0x5c
+
+# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:17], a[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8b,0xad,0xd3,0x04,0x19,0x52,0x7c]
+0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8b,0xad,0xd3,0x04,0x19,0x52,0x7c
+
+# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:9], a[12:19], a[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8b,0xad,0xd3,0x04,0x19,0x52,0x3c]
+0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x8b,0xad,0xd3,0x04,0x19,0x52,0x3c
+
+# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[4:11], v[12:19], a[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x88,0xad,0xd3,0x04,0x19,0x52,0x04]
+0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x88,0xad,0xd3,0x04,0x19,0x52,0x04
+
+# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], a[4:11], a[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x1c]
+0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x1c
+
+# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], a[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x0c]
+0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x0c
+
+# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], a[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x14]
+0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x14
+
+# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:15], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:1 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x09,0xad,0xd3,0x04,0x19,0x52,0x84]
+0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x09,0xad,0xd3,0x04,0x19,0x52,0x84
+
+# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:17], v[20:23], v24, v25 op_sel_hi:[0,0,0] blgp:2 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x44]
+0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x44
+
+# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:17], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:1 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x09,0xad,0xd3,0x04,0x19,0x52,0x64]
+0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x09,0xad,0xd3,0x04,0x19,0x52,0x64
+
+# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 33, 9 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0xa1,0x12,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
+0x00,0x00,0xac,0xd3,0xa1,0x12,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04
+
+# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 9, v2 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x89,0x04,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
+0x00,0x00,0xac,0xd3,0x89,0x04,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04
+
+# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], m0, m0 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x7c,0xf8,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
+0x00,0x00,0xac,0xd3,0x7c,0xf8,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04
+
+# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s20, 9 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x14,0x12,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
+0x00,0x00,0xac,0xd3,0x14,0x12,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04
+
+# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s24, s24 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x30,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
+0x00,0x00,0xac,0xd3,0x18,0x30,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04
+
+# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s24, v44 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x58,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
+0x00,0x00,0xac,0xd3,0x18,0x58,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04
+
+# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v2, 9 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x02,0x13,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
+0x00,0x00,0xac,0xd3,0x02,0x13,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04
+
+# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
+0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04
+
+# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
+0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04
+
+# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x24]
+0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x24
+
+# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x09,0xad,0xd3,0x04,0x19,0x52,0x04]
+0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x09,0xad,0xd3,0x04,0x19,0x52,0x04
+
+# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v44, s24 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x2c,0x31,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
+0x00,0x00,0xac,0xd3,0x2c,0x31,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04
+
+# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], vcc_lo, v2 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x6a,0x04,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
+0x00,0x00,0xac,0xd3,0x6a,0x04,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04
+
+# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:15], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0c,0xad,0xd3,0x04,0x19,0x52,0x84]
+0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0c,0xad,0xd3,0x04,0x19,0x52,0x84
+
+# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:17], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:2 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0c,0xad,0xd3,0x04,0x19,0x52,0x44]
+0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0c,0xad,0xd3,0x04,0x19,0x52,0x44
+
+# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0c,0xad,0xd3,0x04,0x19,0x52,0x24]
+0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0c,0xad,0xd3,0x04,0x19,0x52,0x24
+
+# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:15], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:2 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0a,0xad,0xd3,0x04,0x19,0x52,0x84]
+0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0a,0xad,0xd3,0x04,0x19,0x52,0x84
+
+# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:17], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:2 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0a,0xad,0xd3,0x04,0x19,0x52,0x64]
+0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0a,0xad,0xd3,0x04,0x19,0x52,0x64
+
+# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:17], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:2 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x44]
+0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x44
+
+# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:17], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x64]
+0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x64
+
+# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x04]
+0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x04
+
+# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24]
+0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24
+
+# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x08,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24]
+0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x08,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24
+
+# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24]
+0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24
+
+# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[50:53], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x32,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
+0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x32,0x08,0xad,0xd3,0x04,0x19,0x52,0x04
+
+# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:19], a[24:27], a[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:4 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8c,0xae,0xd3,0x10,0x31,0x82,0x9c]
+0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8c,0xae,0xd3,0x10,0x31,0x82,0x9c
+
+# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:19], a[24:29], a[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:4 blgp:2 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8c,0xae,0xd3,0x10,0x31,0x82,0x5c]
+0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8c,0xae,0xd3,0x10,0x31,0x82,0x5c
+
+# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:19], a[24:31], a[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:4 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8c,0xae,0xd3,0x10,0x31,0x82,0x3c]
+0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8c,0xae,0xd3,0x10,0x31,0x82,0x3c
+
+# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:21], a[24:27], a[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:2 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8a,0xae,0xd3,0x10,0x31,0x82,0x9c]
+0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8a,0xae,0xd3,0x10,0x31,0x82,0x9c
+
+# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:21], a[24:29], a[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:2 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8a,0xae,0xd3,0x10,0x31,0x82,0x7c]
+0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8a,0xae,0xd3,0x10,0x31,0x82,0x7c
+
+# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:21], a[24:29], a[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:3 blgp:2 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8b,0xae,0xd3,0x10,0x31,0x82,0x5c]
+0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8b,0xae,0xd3,0x10,0x31,0x82,0x5c
+
+# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:21], a[24:29], a[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:3 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8b,0xae,0xd3,0x10,0x31,0x82,0x7c]
+0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8b,0xae,0xd3,0x10,0x31,0x82,0x7c
+
+# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:21], a[24:31], a[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8b,0xae,0xd3,0x10,0x31,0x82,0x3c]
+0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8b,0xae,0xd3,0x10,0x31,0x82,0x3c
+
+# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:23], a[24:27], a[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:1 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x89,0xae,0xd3,0x10,0x31,0x82,0x9c]
+0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x89,0xae,0xd3,0x10,0x31,0x82,0x9c
+
+# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:23], a[24:29], a[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:1 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x89,0xae,0xd3,0x10,0x31,0x82,0x7c]
+0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x89,0xae,0xd3,0x10,0x31,0x82,0x7c
+
+# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:23], a[24:31], a[32:47], v48, v49 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x88,0xae,0xd3,0x10,0x31,0x82,0x1c]
+0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x88,0xae,0xd3,0x10,0x31,0x82,0x1c
+
+# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:23], a[24:31], a[32:47], v48, v49 op_sel_hi:[0,0,0] blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x88,0xae,0xd3,0x10,0x31,0x82,0x3c]
+0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x88,0xae,0xd3,0x10,0x31,0x82,0x3c
+
+# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:23], a[24:31], a[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:1 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x89,0xae,0xd3,0x10,0x31,0x82,0x1c]
+0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x89,0xae,0xd3,0x10,0x31,0x82,0x1c
+
+# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:23], v[24:31], a[32:47], v48, v49 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x88,0xae,0xd3,0x10,0x31,0x82,0x0c]
+0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x88,0xae,0xd3,0x10,0x31,0x82,0x0c
+
+# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[24:31], a[32:47], v48, v49 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x88,0xae,0xd3,0x10,0x31,0x82,0x04]
+0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x88,0xae,0xd3,0x10,0x31,0x82,0x04
+
+# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], a[16:23], a[24:31], v[32:47], v48, v49 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x1c]
+0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x1c
+
+# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:19], v[24:27], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:4 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0c,0xae,0xd3,0x10,0x31,0x82,0x84]
+0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0c,0xae,0xd3,0x10,0x31,0x82,0x84
+
+# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:19], v[24:29], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:4 blgp:2 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0c,0xae,0xd3,0x10,0x31,0x82,0x44]
+0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0c,0xae,0xd3,0x10,0x31,0x82,0x44
+
+# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:19], v[24:31], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:4 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0c,0xae,0xd3,0x10,0x31,0x82,0x24]
+0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0c,0xae,0xd3,0x10,0x31,0x82,0x24
+
+# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:27], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:2 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0a,0xae,0xd3,0x10,0x31,0x82,0x84]
+0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0a,0xae,0xd3,0x10,0x31,0x82,0x84
+
+# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:29], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:2 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0a,0xae,0xd3,0x10,0x31,0x82,0x64]
+0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0a,0xae,0xd3,0x10,0x31,0x82,0x64
+
+# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:29], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:2 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x18,0x00,0x0a,0xae,0xd3,0x10,0x31,0x82,0x64]
+0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x18,0x00,0x0a,0xae,0xd3,0x10,0x31,0x82,0x64
+
+# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:29], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:3 blgp:2 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0b,0xae,0xd3,0x10,0x31,0x82,0x44]
+0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0b,0xae,0xd3,0x10,0x31,0x82,0x44
+
+# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:29], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:3 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0b,0xae,0xd3,0x10,0x31,0x82,0x64]
+0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0b,0xae,0xd3,0x10,0x31,0x82,0x64
+
+# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:31], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:2 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0a,0xae,0xd3,0x10,0x31,0x82,0x04]
+0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0a,0xae,0xd3,0x10,0x31,0x82,0x04
+
+# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:31], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0b,0xae,0xd3,0x10,0x31,0x82,0x24]
+0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0b,0xae,0xd3,0x10,0x31,0x82,0x24
+
+# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], a[24:31], v[32:47], v48, v49 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x14]
+0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x14
+
+# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:27], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:1 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x09,0xae,0xd3,0x10,0x31,0x82,0x84]
+0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x09,0xae,0xd3,0x10,0x31,0x82,0x84
+
+# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:29], v[32:47], v48, v49 op_sel_hi:[0,0,0] blgp:2 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x44]
+0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x44
+
+# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:29], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:1 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x09,0xae,0xd3,0x10,0x31,0x82,0x64]
+0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x09,0xae,0xd3,0x10,0x31,0x82,0x64
+
+# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04]
+0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04
+
+# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel_hi:[0,0,0] blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x24]
+0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x24
+
+# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:1 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x09,0xae,0xd3,0x10,0x31,0x82,0x04]
+0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x09,0xae,0xd3,0x10,0x31,0x82,0x04
+
+# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[50:65], v[16:23], v[24:31], v[32:47], v48, v49 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x32,0x08,0xae,0xd3,0x10,0x31,0x82,0x04]
+0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x32,0x08,0xae,0xd3,0x10,0x31,0x82,0x04
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx950_vop3px2.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx950_vop3px2.txt
new file mode 100644
index 00000000000000..3227b619de4fdd
--- /dev/null
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx950_vop3px2.txt
@@ -0,0 +1,28 @@
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -show-encoding -disassemble %s | FileCheck -check-prefix=GFX950 %s
+
+# Check the behavior when the instruction is truncated to just the ld_scale
+
+# GFX950: v_mfma_ld_scale_b32 v20, v21 op_sel_hi:[0,0] ; encoding: [0x00,0x40,0xac,0xd3,0x14,0x2b,0x02,0x00]
+0x00,0x00,0xac,0xd3,0x14,0x2b,0x02,0x00
+
+# GFX950: v_mfma_ld_scale_b32 v20, v21 op_sel_hi:[0,0] ; encoding: [0x00,0x40,0xac,0xd3,0x14,0x2b,0x02,0x00]
+0x00,0x00,0xac,0xd3,0x14,0x2b,0x02,0x00
+
+# GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] ; encoding: [0x00,0x80,0xad,0xd3,0x00,0x11,0x02,0x04]
+0x00,0x00,0xac,0xd3,0x14,0x2b,0x02,0x00,0x00,0x80,0xad,0xd3,0x00,0x11,0x02,0x04
+
+# GFX950: v_mfma_ld_scale_b32 v20, v21 op_sel_hi:[0,0] ; encoding: [0x00,0x40,0xac,0xd3,0x14,0x2b,0x02,0x00]
+0x00,0x00,0xac,0xd3,0x14,0x2b,0x02,0x00
+
+# GFX950: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
+0x00,0x00,0x80,0xbf
+
+# GFX950: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] ; encoding: [0x00,0x80,0xad,0xd3,0x00,0x11,0x02,0x04]
+0x00,0x80,0xad,0xd3,0x00,0x11,0x02,0x04
+
+
+# GFX950: v_mfma_ld_scale_b32 v20, v21 op_sel_hi:[0,0] ; encoding: [0x00,0x40,0xac,0xd3,0x14,0x2b,0x02,0x00]
+0x00,0x00,0xac,0xd3,0x14,0x2b,0x02,0x00
+
+# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x14,0x2b,0x02,0x00,0x00,0x88,0xad,0xd3,0x00,0x11,0x02,0x04]
+0x00,0x00,0xac,0xd3,0x14,0x2b,0x02,0x00,0x00,0x88,0xad,0xd3,0x00,0x11,0x02,0x04
diff --git a/llvm/test/tools/llvm-mca/AMDGPU/gfx950.s b/llvm/test/tools/llvm-mca/AMDGPU/gfx950.s
index e601de8d706b44..3ab3319c21ecdf 100644
--- a/llvm/test/tools/llvm-mca/AMDGPU/gfx950.s
+++ b/llvm/test/tools/llvm-mca/AMDGPU/gfx950.s
@@ -1,11 +1,13 @@
# RUN: llvm-mca -mtriple=amdgcn -mcpu=gfx950 --timeline --iterations=1 --timeline-max-cycles=0 < %s | FileCheck %s
# CHECK: Iterations: 1
-# CHECK: Instructions: 7
-# CHECK: Total Cycles: 42
-# CHECK: Total uOps: 7
+# CHECK: Instructions: 9
+# CHECK: Total Cycles: 44
+# CHECK: Total uOps: 9
v_mfma_ld_scale_b32 v0, v0
+v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3]
+v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15]
v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] blgp:1
v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[0:3], a[4:7]
@@ -14,9 +16,10 @@ v_mfma_f32_32x32x16_f16 a[0:15], a[0:3], a[0:3], a[0:15] blgp:2
v_mfma_f32_32x32x16_bf16 v[0:15], v[0:3], v[0:3], v[0:15]
v_mfma_f32_32x32x16_bf16 a[0:15], a[0:3], a[0:3], a[0:15] blgp:2
-
# CHECK: [0] [1] [2] [3] [4] [5] [6] Instructions:
# CHECK-NEXT: - - - - 1.00 - - v_mfma_ld_scale_b32 v0, v0
+# CHECK-NEXT: - - - - 1.00 - - v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3]
+# CHECK-NEXT: - - - - 1.00 - - v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15]
# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] blgp:1
# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[0:3], a[4:7]
# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], v[0:15]
More information about the llvm-commits
mailing list