[llvm] 05a705e - [AMDGPU] Restrict to VGPR only for mfma scale operands (#158117)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 11 13:10:38 PDT 2025
Author: Changpeng Fang
Date: 2025-09-11T13:10:33-07:00
New Revision: 05a705efda4bf62f54eed1bcb97e212ae823c585
URL: https://github.com/llvm/llvm-project/commit/05a705efda4bf62f54eed1bcb97e212ae823c585
DIFF: https://github.com/llvm/llvm-project/commit/05a705efda4bf62f54eed1bcb97e212ae823c585.diff
LOG: [AMDGPU] Restrict to VGPR only for mfma scale operands (#158117)
Restrict to VGPR only (VRegSrc_32) for mfma scale operands to workaround
a hardware design defect: For all Inline/SGPR constants, SP HW use bits
[30:23] as the scale.
TODO: We may still be able to allow Inline Constants/SGPR, with a proper
shift, to obtain a potentially better performance.
Fixes: SWDEV-548629
Added:
Modified:
llvm/lib/Target/AMDGPU/VOP3PInstructions.td
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir
llvm/test/MC/AMDGPU/mai-gfx950-err.s
llvm/test/MC/AMDGPU/mai-gfx950.s
llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 6f778a0d262af..f7279b664ed27 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -966,9 +966,14 @@ class MAIInst<string OpName, VOPProfile P, SDPatternOperator node, bit Scaled =
class ScaledMAIInst<string OpName, MAIInst BaseInst, SDPatternOperator node> :
MAIInst<OpName, BaseInst.Pfl, node, /*Scaled=*/true> {
// Append operands from V_MFMA_LD_SCALE_B32, but we need to rename them.
+ // Restrict to VGPR only (VRegSrc_32) for the scale operands to workaround a
+ // hardware design defect: For all Inline/SGPR constants, SP HW use bits
+ // [30:23] as the scale.
+ // TODO: We may still be able to allow Inline Constants/SGPR, with a proper
+ // shift, to obtain a potentially better performance.
let InOperandList = !con(BaseInst.InOperandList,
- (ins VSrc_b32:$scale_src0,
- VSrc_b32:$scale_src1,
+ (ins VRegSrc_32:$scale_src0,
+ VRegSrc_32:$scale_src1,
op_sel0:$src0_modifiers,
op_sel_hi0:$src1_modifiers));
let AsmOperands =
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
index f78ea92b4840b..17ae6dd23b199 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
@@ -1425,9 +1425,10 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_
; GCN-NEXT: v_accvgpr_write_b32 a1, v17
; GCN-NEXT: v_accvgpr_write_b32 a2, v18
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: v_mov_b32_e32 v16, s1
+; GCN-NEXT: v_mov_b32_e32 v16, s0
+; GCN-NEXT: v_mov_b32_e32 v17, s1
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v16 op_sel_hi:[0,0,0]
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0]
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1447,8 +1448,9 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_
; GCN-NEXT: v_accvgpr_write_b32 a1, v17
; GCN-NEXT: v_accvgpr_write_b32 a2, v18
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: v_mov_b32_e32 v16, s0
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v20 op_sel_hi:[0,0,0]
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v20 op_sel_hi:[0,0,0]
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1468,8 +1470,9 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_
; GCN-NEXT: v_accvgpr_write_b32 a1, v17
; GCN-NEXT: v_accvgpr_write_b32 a2, v18
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: v_mov_b32_e32 v16, s0
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, s0 op_sel_hi:[0,0,0]
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v16 op_sel_hi:[0,0,0]
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1567,8 +1570,9 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp
; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT: v_mov_b32_e32 v8, s20
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], s20, v12 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v8, v12 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1592,8 +1596,9 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp
; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT: v_mov_b32_e32 v8, s20
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], s20, v12 op_sel_hi:[0,0,0]
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v8, v12 op_sel_hi:[0,0,0]
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1621,8 +1626,9 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp
; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT: v_mov_b32_e32 v8, s20
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v12, s20 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v12, v8 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1646,8 +1652,9 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp
; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT: v_mov_b32_e32 v8, s20
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v12, s20 op_sel_hi:[0,0,0]
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v12, v8 op_sel_hi:[0,0,0]
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1675,8 +1682,9 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT: v_mov_b32_e32 v8, s20
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, s20 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, v8 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1700,8 +1708,9 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT: v_mov_b32_e32 v8, s20
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, s20 op_sel_hi:[0,0,0]
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, v8 op_sel_hi:[0,0,0]
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1721,8 +1730,9 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_vgpr_sgpr__vgp
; GCN-NEXT: v_accvgpr_write_b32 a1, s1
; GCN-NEXT: v_accvgpr_write_b32 a2, s2
; GCN-NEXT: v_accvgpr_write_b32 a3, s3
+; GCN-NEXT: v_mov_b32_e32 v17, s16
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, s16 op_sel_hi:[0,0,0]
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0]
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1750,8 +1760,9 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp
; SDAG-NEXT: v_accvgpr_write_b32 a1, s21
; SDAG-NEXT: v_accvgpr_write_b32 a2, s22
; SDAG-NEXT: v_accvgpr_write_b32 a3, s23
+; SDAG-NEXT: v_mov_b32_e32 v9, s24
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[10:17], v[0:7], a[0:3], v8, s24 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[10:17], v[0:7], a[0:3], v8, v9 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1775,8 +1786,9 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp
; GISEL-NEXT: v_accvgpr_write_b32 a1, s21
; GISEL-NEXT: v_accvgpr_write_b32 a2, s22
; GISEL-NEXT: v_accvgpr_write_b32 a3, s23
+; GISEL-NEXT: v_mov_b32_e32 v9, s24
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[10:17], v[0:7], a[0:3], v8, s24 op_sel_hi:[0,0,0]
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[10:17], v[0:7], a[0:3], v8, v9 op_sel_hi:[0,0,0]
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1789,22 +1801,43 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp
}
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 33, -2 op_sel_hi:[1,1,0]
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v20, -2
+; SDAG-NEXT: v_mov_b32_e32 v21, 33
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[1,1,0]
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_mov_b32_e32 v16, 33
+; GISEL-NEXT: v_mov_b32_e32 v17, -2
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[1,1,0]
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 2, i32 33, i32 2, i32 -2)
ret <4 x float> %result
}
@@ -1813,13 +1846,14 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: s_movk_i32 s0, 0x41
+; SDAG-NEXT: v_mov_b32_e32 v20, -2
+; SDAG-NEXT: v_mov_b32_e32 v21, 0x41
; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, -2 op_sel_hi:[1,1,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1836,8 +1870,9 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
; GISEL-NEXT: v_mov_b32_e32 v16, 0x41
+; GISEL-NEXT: v_mov_b32_e32 v17, -2
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, -2 op_sel_hi:[1,1,0]
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[1,1,0]
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1853,14 +1888,14 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_kimm:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: s_movk_i32 s0, 0x41
+; SDAG-NEXT: v_mov_b32_e32 v20, 0x4d
+; SDAG-NEXT: v_mov_b32_e32 v21, 0x41
; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_mov_b32_e32 v16, 0x4d
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v16 op_sel_hi:[1,1,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1919,9 +1954,10 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32
; SDAG-NEXT: v_mov_b32_e32 v17, s9
; SDAG-NEXT: v_mov_b32_e32 v18, s10
; SDAG-NEXT: v_mov_b32_e32 v19, s11
-; SDAG-NEXT: v_mov_b32_e32 v21, s13
+; SDAG-NEXT: v_mov_b32_e32 v21, s12
+; SDAG-NEXT: v_mov_b32_e32 v22, s13
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], s12, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v21, v22 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[14:15]
@@ -1942,9 +1978,10 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
-; GISEL-NEXT: v_mov_b32_e32 v20, s29
+; GISEL-NEXT: v_mov_b32_e32 v20, s28
+; GISEL-NEXT: v_mov_b32_e32 v21, s29
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], s28, v20 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
@@ -1960,8 +1997,9 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
-; SDAG-NEXT: s_movk_i32 s6, 0x41
-; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
+; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50
+; SDAG-NEXT: v_mov_b32_e32 v21, -2
+; SDAG-NEXT: v_mov_b32_e32 v22, 0x41
; SDAG-NEXT: v_mov_b32_e32 v20, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v0, s8
@@ -1983,18 +2021,19 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: v_mov_b32_e32 v15, s23
; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], s6, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[4:5]
+; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
+; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50
; GISEL-NEXT: v_mov_b32_e32 v20, 0x41
-; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
+; GISEL-NEXT: v_mov_b32_e32 v21, -2
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
@@ -2007,11 +2046,11 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GISEL-NEXT: s_endpgm
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 -2)
store <4 x float> %result, ptr addrspace(1) %ptr, align 16
@@ -2023,8 +2062,9 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
-; SDAG-NEXT: s_movk_i32 s6, 0x41
-; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
+; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50
+; SDAG-NEXT: v_mov_b32_e32 v21, 1.0
+; SDAG-NEXT: v_mov_b32_e32 v22, 0x41
; SDAG-NEXT: v_mov_b32_e32 v20, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v0, s8
@@ -2046,18 +2086,19 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: v_mov_b32_e32 v15, s23
; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], s6, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[4:5]
+; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__FP_literal:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
+; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50
; GISEL-NEXT: v_mov_b32_e32 v20, 0x41
-; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
+; GISEL-NEXT: v_mov_b32_e32 v21, 1.0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
@@ -2070,11 +2111,11 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GISEL-NEXT: s_endpgm
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 1065353216)
store <4 x float> %result, ptr addrspace(1) %ptr, align 16
@@ -2086,8 +2127,10 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
+; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50
+; SDAG-NEXT: v_mov_b32_e32 v21, -2
+; SDAG-NEXT: v_mov_b32_e32 v22, 1.0
; SDAG-NEXT: v_mov_b32_e32 v20, 0
-; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
@@ -2108,16 +2151,19 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: v_mov_b32_e32 v15, s23
; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[4:5]
+; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__inline_imm:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
+; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50
+; GISEL-NEXT: v_mov_b32_e32 v20, 1.0
+; GISEL-NEXT: v_mov_b32_e32 v21, -2
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
@@ -2129,14 +2175,12 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
-; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
; GISEL-NEXT: v_mov_b32_e32 v4, 0
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: s_nop 7
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
+; GISEL-NEXT: s_nop 2
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GISEL-NEXT: s_endpgm
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 -2)
store <4 x float> %result, ptr addrspace(1) %ptr, align 16
@@ -2148,8 +2192,10 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
+; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50
+; SDAG-NEXT: v_mov_b32_e32 v21, 0.15915494
+; SDAG-NEXT: v_mov_b32_e32 v22, 1.0
; SDAG-NEXT: v_mov_b32_e32 v20, 0
-; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
@@ -2170,16 +2216,19 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: v_mov_b32_e32 v15, s23
; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[4:5]
+; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__FP_literal:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
+; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50
+; GISEL-NEXT: v_mov_b32_e32 v20, 1.0
+; GISEL-NEXT: v_mov_b32_e32 v21, 0.15915494
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
@@ -2191,14 +2240,12 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
-; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
; GISEL-NEXT: v_mov_b32_e32 v4, 0
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: s_nop 7
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
+; GISEL-NEXT: s_nop 2
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GISEL-NEXT: s_endpgm
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 1042479491)
store <4 x float> %result, ptr addrspace(1) %ptr, align 16
@@ -2250,43 +2297,85 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b(
}
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 0, 1 op_sel_hi:[0,0,0]
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v20, 1
+; SDAG-NEXT: v_mov_b32_e32 v21, 0
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_mov_b32_e32 v16, 0
+; GISEL-NEXT: v_mov_b32_e32 v17, 1
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1)
ret <4 x float> %result
}
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1, 0 op_sel_hi:[0,0,0]
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v20, 0
+; SDAG-NEXT: v_mov_b32_e32 v21, 1
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_mov_b32_e32 v16, 1
+; GISEL-NEXT: v_mov_b32_e32 v17, 0
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0)
ret <4 x float> %result
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
index 24af3fa5ff9b7..839f0324227ca 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
@@ -3387,10 +3387,11 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__sgpr_
; GCN-NEXT: v_accvgpr_write_b32 a12, v28
; GCN-NEXT: v_accvgpr_write_b32 a13, v29
; GCN-NEXT: v_accvgpr_write_b32 a14, v30
-; GCN-NEXT: v_mov_b32_e32 v16, s1
+; GCN-NEXT: v_mov_b32_e32 v16, s0
+; GCN-NEXT: v_mov_b32_e32 v17, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel_hi:[0,0,0]
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0]
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
@@ -3436,9 +3437,10 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_
; GCN-NEXT: v_accvgpr_write_b32 a12, v28
; GCN-NEXT: v_accvgpr_write_b32 a13, v29
; GCN-NEXT: v_accvgpr_write_b32 a14, v30
+; GCN-NEXT: v_mov_b32_e32 v16, s0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v31 op_sel_hi:[0,0,0]
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v31 op_sel_hi:[0,0,0]
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
@@ -3484,9 +3486,10 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_
; GCN-NEXT: v_accvgpr_write_b32 a12, v28
; GCN-NEXT: v_accvgpr_write_b32 a13, v29
; GCN-NEXT: v_accvgpr_write_b32 a14, v30
+; GCN-NEXT: v_mov_b32_e32 v16, s0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, s0 op_sel_hi:[0,0,0]
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v16 op_sel_hi:[0,0,0]
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
@@ -3659,8 +3662,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp
; SDAG-NEXT: v_accvgpr_write_b32 a13, v21
; SDAG-NEXT: v_accvgpr_write_b32 a14, v22
; SDAG-NEXT: v_accvgpr_write_b32 a15, v23
+; SDAG-NEXT: v_mov_b32_e32 v8, s20
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], s20, v24 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v8, v24 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
@@ -3709,8 +3713,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp
; GISEL-NEXT: v_accvgpr_write_b32 a13, v21
; GISEL-NEXT: v_accvgpr_write_b32 a14, v22
; GISEL-NEXT: v_accvgpr_write_b32 a15, v23
+; GISEL-NEXT: v_mov_b32_e32 v8, s20
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], s20, v24 op_sel_hi:[0,0,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v8, v24 op_sel_hi:[0,0,0]
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 3
@@ -3763,8 +3768,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp
; SDAG-NEXT: v_accvgpr_write_b32 a13, v21
; SDAG-NEXT: v_accvgpr_write_b32 a14, v22
; SDAG-NEXT: v_accvgpr_write_b32 a15, v23
+; SDAG-NEXT: v_mov_b32_e32 v8, s20
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v24, s20 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v24, v8 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
@@ -3813,8 +3819,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp
; GISEL-NEXT: v_accvgpr_write_b32 a13, v21
; GISEL-NEXT: v_accvgpr_write_b32 a14, v22
; GISEL-NEXT: v_accvgpr_write_b32 a15, v23
+; GISEL-NEXT: v_mov_b32_e32 v8, s20
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v24, s20 op_sel_hi:[0,0,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v24, v8 op_sel_hi:[0,0,0]
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 3
@@ -3867,8 +3874,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
; SDAG-NEXT: v_accvgpr_write_b32 a13, v21
; SDAG-NEXT: v_accvgpr_write_b32 a14, v22
; SDAG-NEXT: v_accvgpr_write_b32 a15, v23
+; SDAG-NEXT: v_mov_b32_e32 v8, s20
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[26:33], a[0:15], v24, s20 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[26:33], a[0:15], v24, v8 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
@@ -3917,8 +3925,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
; GISEL-NEXT: v_accvgpr_write_b32 a13, v21
; GISEL-NEXT: v_accvgpr_write_b32 a14, v22
; GISEL-NEXT: v_accvgpr_write_b32 a15, v23
+; GISEL-NEXT: v_mov_b32_e32 v8, s20
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[26:33], a[0:15], v24, s20 op_sel_hi:[0,0,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[26:33], a[0:15], v24, v8 op_sel_hi:[0,0,0]
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 3
@@ -3963,8 +3972,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgp
; GCN-NEXT: v_accvgpr_write_b32 a13, s25
; GCN-NEXT: v_accvgpr_write_b32 a14, s26
; GCN-NEXT: v_accvgpr_write_b32 a15, s27
+; GCN-NEXT: v_mov_b32_e32 v17, s28
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, s28 op_sel_hi:[0,0,0]
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0]
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
@@ -4114,48 +4124,95 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp
}
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: scratch_load_dword a15, off, s32
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: v_accvgpr_write_b32 a4, v20
-; GCN-NEXT: v_accvgpr_write_b32 a5, v21
-; GCN-NEXT: v_accvgpr_write_b32 a6, v22
-; GCN-NEXT: v_accvgpr_write_b32 a7, v23
-; GCN-NEXT: v_accvgpr_write_b32 a8, v24
-; GCN-NEXT: v_accvgpr_write_b32 a9, v25
-; GCN-NEXT: v_accvgpr_write_b32 a10, v26
-; GCN-NEXT: v_accvgpr_write_b32 a11, v27
-; GCN-NEXT: v_accvgpr_write_b32 a12, v28
-; GCN-NEXT: v_accvgpr_write_b32 a13, v29
-; GCN-NEXT: v_accvgpr_write_b32 a14, v30
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 33, -2 op_sel_hi:[1,1,0]
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: v_mov_b32_e32 v31, -2
+; SDAG-NEXT: v_mov_b32_e32 v32, 33
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0]
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: v_mov_b32_e32 v31, 33
+; GISEL-NEXT: v_mov_b32_e32 v32, -2
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0]
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 33, i32 2, i32 -2)
ret <16 x float> %result
}
@@ -4165,7 +4222,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: s_movk_i32 s0, 0x41
+; SDAG-NEXT: v_mov_b32_e32 v31, -2
+; SDAG-NEXT: v_mov_b32_e32 v32, 0x41
; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
@@ -4183,7 +4241,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale
; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, -2 op_sel_hi:[1,1,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
@@ -4210,6 +4268,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: scratch_load_dword a15, off, s32
; GISEL-NEXT: v_mov_b32_e32 v31, 0x41
+; GISEL-NEXT: v_mov_b32_e32 v32, -2
; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
@@ -4227,7 +4286,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale
; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, -2 op_sel_hi:[1,1,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0]
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 3
@@ -4257,7 +4316,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: s_movk_i32 s0, 0x41
+; SDAG-NEXT: v_mov_b32_e32 v31, 1.0
+; SDAG-NEXT: v_mov_b32_e32 v32, 0x41
; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
@@ -4275,7 +4335,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale
; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, 1.0 op_sel_hi:[1,1,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
@@ -4302,6 +4362,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: scratch_load_dword a15, off, s32
; GISEL-NEXT: v_mov_b32_e32 v31, 0x41
+; GISEL-NEXT: v_mov_b32_e32 v32, 1.0
; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
@@ -4319,7 +4380,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale
; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, 1.0 op_sel_hi:[1,1,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0]
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 3
@@ -4345,106 +4406,12 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale
}
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: scratch_load_dword a15, off, s32
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: v_accvgpr_write_b32 a4, v20
-; GCN-NEXT: v_accvgpr_write_b32 a5, v21
-; GCN-NEXT: v_accvgpr_write_b32 a6, v22
-; GCN-NEXT: v_accvgpr_write_b32 a7, v23
-; GCN-NEXT: v_accvgpr_write_b32 a8, v24
-; GCN-NEXT: v_accvgpr_write_b32 a9, v25
-; GCN-NEXT: v_accvgpr_write_b32 a10, v26
-; GCN-NEXT: v_accvgpr_write_b32 a11, v27
-; GCN-NEXT: v_accvgpr_write_b32 a12, v28
-; GCN-NEXT: v_accvgpr_write_b32 a13, v29
-; GCN-NEXT: v_accvgpr_write_b32 a14, v30
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 1.0, -2 op_sel_hi:[1,1,0]
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 1065353216, i32 2, i32 -2)
- ret <16 x float> %result
-}
-
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: scratch_load_dword a15, off, s32
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: v_accvgpr_write_b32 a4, v20
-; GCN-NEXT: v_accvgpr_write_b32 a5, v21
-; GCN-NEXT: v_accvgpr_write_b32 a6, v22
-; GCN-NEXT: v_accvgpr_write_b32 a7, v23
-; GCN-NEXT: v_accvgpr_write_b32 a8, v24
-; GCN-NEXT: v_accvgpr_write_b32 a9, v25
-; GCN-NEXT: v_accvgpr_write_b32 a10, v26
-; GCN-NEXT: v_accvgpr_write_b32 a11, v27
-; GCN-NEXT: v_accvgpr_write_b32 a12, v28
-; GCN-NEXT: v_accvgpr_write_b32 a13, v29
-; GCN-NEXT: v_accvgpr_write_b32 a14, v30
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 0.15915494, 1.0 op_sel_hi:[1,1,0]
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 1042479491, i32 2, i32 1065353216)
- ret <16 x float> %result
-}
-
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm:
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: s_movk_i32 s0, 0x41
-; SDAG-NEXT: v_mov_b32_e32 v31, 0x4d
+; SDAG-NEXT: v_mov_b32_e32 v31, -2
+; SDAG-NEXT: v_mov_b32_e32 v32, 1.0
; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
@@ -4462,7 +4429,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale
; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v31 op_sel_hi:[1,1,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
@@ -4484,12 +4451,12 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale
; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: v_mov_b32_e32 v31, 0x41
-; GISEL-NEXT: v_mov_b32_e32 v32, 0x4d
+; GISEL-NEXT: v_mov_b32_e32 v31, 1.0
+; GISEL-NEXT: v_mov_b32_e32 v32, -2
; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
@@ -4528,44 +4495,233 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale
; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 77)
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 1065353216, i32 2, i32 -2)
ret <16 x float> %result
}
-define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1, ptr addrspace(1) %ptr) #0 {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd:
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal:
; SDAG: ; %bb.0:
-; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
-; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40
-; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
-; SDAG-NEXT: v_mov_b32_e32 v16, s8
-; SDAG-NEXT: v_mov_b32_e32 v17, s9
-; SDAG-NEXT: v_mov_b32_e32 v18, s10
-; SDAG-NEXT: v_mov_b32_e32 v19, s11
-; SDAG-NEXT: v_mov_b32_e32 v20, s12
-; SDAG-NEXT: v_mov_b32_e32 v21, s13
-; SDAG-NEXT: v_mov_b32_e32 v22, s14
-; SDAG-NEXT: v_mov_b32_e32 v23, s15
-; SDAG-NEXT: v_mov_b32_e32 v24, s16
-; SDAG-NEXT: v_mov_b32_e32 v25, s17
-; SDAG-NEXT: v_mov_b32_e32 v26, s18
-; SDAG-NEXT: v_mov_b32_e32 v27, s19
-; SDAG-NEXT: v_mov_b32_e32 v28, s20
-; SDAG-NEXT: v_mov_b32_e32 v29, s21
-; SDAG-NEXT: v_mov_b32_e32 v30, s22
-; SDAG-NEXT: v_mov_b32_e32 v31, s23
-; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
-; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: v_mov_b32_e32 v31, 1.0
+; SDAG-NEXT: v_mov_b32_e32 v32, 0.15915494
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0]
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: v_mov_b32_e32 v31, 0.15915494
+; GISEL-NEXT: v_mov_b32_e32 v32, 1.0
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0]
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 1042479491, i32 2, i32 1065353216)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: v_mov_b32_e32 v31, 0x4d
+; SDAG-NEXT: v_mov_b32_e32 v32, 0x41
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0]
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: v_mov_b32_e32 v31, 0x41
+; GISEL-NEXT: v_mov_b32_e32 v32, 0x4d
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0]
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 77)
+ ret <16 x float> %result
+}
+
+define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1, ptr addrspace(1) %ptr) #0 {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
+; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40
+; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: v_mov_b32_e32 v20, s12
+; SDAG-NEXT: v_mov_b32_e32 v21, s13
+; SDAG-NEXT: v_mov_b32_e32 v22, s14
+; SDAG-NEXT: v_mov_b32_e32 v23, s15
+; SDAG-NEXT: v_mov_b32_e32 v24, s16
+; SDAG-NEXT: v_mov_b32_e32 v25, s17
+; SDAG-NEXT: v_mov_b32_e32 v26, s18
+; SDAG-NEXT: v_mov_b32_e32 v27, s19
+; SDAG-NEXT: v_mov_b32_e32 v28, s20
+; SDAG-NEXT: v_mov_b32_e32 v29, s21
+; SDAG-NEXT: v_mov_b32_e32 v30, s22
+; SDAG-NEXT: v_mov_b32_e32 v31, s23
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
-; SDAG-NEXT: v_mov_b32_e32 v32, s1
+; SDAG-NEXT: v_mov_b32_e32 v32, s0
+; SDAG-NEXT: v_mov_b32_e32 v33, s1
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], s0, v32 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 7
@@ -4598,9 +4754,10 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32>
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
-; GISEL-NEXT: v_mov_b32_e32 v32, s1
+; GISEL-NEXT: v_mov_b32_e32 v32, s0
+; GISEL-NEXT: v_mov_b32_e32 v33, s1
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], s0, v32 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
; GISEL-NEXT: v_mov_b32_e32 v16, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 7
@@ -4620,7 +4777,8 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40
-; SDAG-NEXT: s_movk_i32 s2, 0x41
+; SDAG-NEXT: v_mov_b32_e32 v32, -2
+; SDAG-NEXT: v_mov_b32_e32 v33, 0x41
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v16, s8
@@ -4648,7 +4806,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_
; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], s2, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v33, v32 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 7
@@ -4664,6 +4822,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40
; GISEL-NEXT: v_mov_b32_e32 v32, 0x41
+; GISEL-NEXT: v_mov_b32_e32 v33, -2
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
@@ -4683,7 +4842,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
; GISEL-NEXT: v_mov_b32_e32 v16, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 7
@@ -4738,9 +4897,10 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x
; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
-; SDAG-NEXT: v_mov_b32_e32 v0, s1
+; SDAG-NEXT: v_mov_b32_e32 v0, s0
+; SDAG-NEXT: v_mov_b32_e32 v1, s1
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[2:9], v[10:17], a[0:15], s0, v0 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[2:9], v[10:17], a[0:15], v0, v1 op_sel_hi:[0,0,0]
; SDAG-NEXT: v_mov_b32_e32 v2, s20
; SDAG-NEXT: v_mov_b32_e32 v3, s21
; SDAG-NEXT: v_mov_b32_e32 v4, s22
@@ -4811,10 +4971,11 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x
; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
-; GISEL-NEXT: v_mov_b32_e32 v20, s1
+; GISEL-NEXT: v_mov_b32_e32 v20, s0
+; GISEL-NEXT: v_mov_b32_e32 v21, s1
; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v20 op_sel_hi:[0,0,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v20, v21 op_sel_hi:[0,0,0]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
@@ -4852,24 +5013,26 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0
+; SDAG-NEXT: v_mov_b32_e32 v0, 42
+; SDAG-NEXT: v_mov_b32_e32 v1, 25
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: v_mov_b32_e32 v4, s16
-; SDAG-NEXT: v_mov_b32_e32 v5, s17
-; SDAG-NEXT: v_mov_b32_e32 v6, s18
-; SDAG-NEXT: v_mov_b32_e32 v7, s19
-; SDAG-NEXT: v_mov_b32_e32 v8, s20
-; SDAG-NEXT: v_mov_b32_e32 v9, s21
-; SDAG-NEXT: v_mov_b32_e32 v10, s22
-; SDAG-NEXT: v_mov_b32_e32 v11, s23
+; SDAG-NEXT: v_mov_b32_e32 v2, s12
+; SDAG-NEXT: v_mov_b32_e32 v3, s13
+; SDAG-NEXT: v_mov_b32_e32 v4, s14
+; SDAG-NEXT: v_mov_b32_e32 v5, s15
+; SDAG-NEXT: v_mov_b32_e32 v6, s16
+; SDAG-NEXT: v_mov_b32_e32 v7, s17
+; SDAG-NEXT: v_mov_b32_e32 v8, s18
+; SDAG-NEXT: v_mov_b32_e32 v9, s19
+; SDAG-NEXT: v_mov_b32_e32 v10, s20
+; SDAG-NEXT: v_mov_b32_e32 v11, s21
+; SDAG-NEXT: v_mov_b32_e32 v12, s22
+; SDAG-NEXT: v_mov_b32_e32 v13, s23
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
-; SDAG-NEXT: v_mov_b32_e32 v12, s24
-; SDAG-NEXT: v_mov_b32_e32 v13, s25
-; SDAG-NEXT: v_mov_b32_e32 v14, s26
-; SDAG-NEXT: v_mov_b32_e32 v15, s27
+; SDAG-NEXT: v_mov_b32_e32 v14, s24
+; SDAG-NEXT: v_mov_b32_e32 v15, s25
+; SDAG-NEXT: v_mov_b32_e32 v16, s26
+; SDAG-NEXT: v_mov_b32_e32 v17, s27
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
@@ -4888,7 +5051,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8
; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[2:9], v[10:17], a[0:15], v1, v0 op_sel_hi:[0,0,0] blgp:2
; SDAG-NEXT: v_mov_b32_e32 v2, s20
; SDAG-NEXT: v_mov_b32_e32 v3, s21
; SDAG-NEXT: v_mov_b32_e32 v4, s22
@@ -4931,9 +5094,9 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
+; GISEL-NEXT: v_mov_b32_e32 v20, 25
+; GISEL-NEXT: v_mov_b32_e32 v21, 42
; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], 16
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
@@ -4959,14 +5122,15 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8
; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], 16
; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v20, v21 op_sel_hi:[0,0,0] blgp:2
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
@@ -4978,7 +5142,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: s_nop 2
; GISEL-NEXT: global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1
@@ -5123,6 +5287,8 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_nonmac:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0
+; SDAG-NEXT: v_mov_b32_e32 v32, 42
+; SDAG-NEXT: v_mov_b32_e32 v33, 25
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v16, s12
; SDAG-NEXT: v_mov_b32_e32 v17, s13
@@ -5151,7 +5317,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non
; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v33, v32 op_sel_hi:[0,0,0] blgp:2
; SDAG-NEXT: v_mov_b32_e32 v16, s20
; SDAG-NEXT: v_mov_b32_e32 v17, s21
; SDAG-NEXT: v_mov_b32_e32 v18, s22
@@ -5195,9 +5361,9 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
-; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0
+; GISEL-NEXT: v_mov_b32_e32 v32, 25
+; GISEL-NEXT: v_mov_b32_e32 v33, 42
; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16
-; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[36:37]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[38:39]
@@ -5215,10 +5381,11 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32
; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel_hi:[0,0,0] blgp:2
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
@@ -5234,7 +5401,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: s_nop 2
; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1
@@ -5345,95 +5512,189 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_b(
}
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: scratch_load_dword a15, off, s32
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: v_accvgpr_write_b32 a4, v20
-; GCN-NEXT: v_accvgpr_write_b32 a5, v21
-; GCN-NEXT: v_accvgpr_write_b32 a6, v22
-; GCN-NEXT: v_accvgpr_write_b32 a7, v23
-; GCN-NEXT: v_accvgpr_write_b32 a8, v24
-; GCN-NEXT: v_accvgpr_write_b32 a9, v25
-; GCN-NEXT: v_accvgpr_write_b32 a10, v26
-; GCN-NEXT: v_accvgpr_write_b32 a11, v27
-; GCN-NEXT: v_accvgpr_write_b32 a12, v28
-; GCN-NEXT: v_accvgpr_write_b32 a13, v29
-; GCN-NEXT: v_accvgpr_write_b32 a14, v30
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 0, 1 op_sel_hi:[0,0,0]
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: v_mov_b32_e32 v31, 1
+; SDAG-NEXT: v_mov_b32_e32 v32, 0
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: v_mov_b32_e32 v31, 0
+; GISEL-NEXT: v_mov_b32_e32 v32, 1
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1)
ret <16 x float> %result
}
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: scratch_load_dword a15, off, s32
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: v_accvgpr_write_b32 a4, v20
-; GCN-NEXT: v_accvgpr_write_b32 a5, v21
-; GCN-NEXT: v_accvgpr_write_b32 a6, v22
-; GCN-NEXT: v_accvgpr_write_b32 a7, v23
-; GCN-NEXT: v_accvgpr_write_b32 a8, v24
-; GCN-NEXT: v_accvgpr_write_b32 a9, v25
-; GCN-NEXT: v_accvgpr_write_b32 a10, v26
-; GCN-NEXT: v_accvgpr_write_b32 a11, v27
-; GCN-NEXT: v_accvgpr_write_b32 a12, v28
-; GCN-NEXT: v_accvgpr_write_b32 a13, v29
-; GCN-NEXT: v_accvgpr_write_b32 a14, v30
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 1, 0 op_sel_hi:[0,0,0]
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: v_mov_b32_e32 v31, 0
+; SDAG-NEXT: v_mov_b32_e32 v32, 1
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: v_mov_b32_e32 v31, 1
+; GISEL-NEXT: v_mov_b32_e32 v32, 0
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0)
ret <16 x float> %result
}
diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir
index 4585eca8fe894..c01c2be23b83f 100644
--- a/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir
+++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir
@@ -157,19 +157,19 @@ name: V_MFMA_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64___xdl_write_vgpr__c
tracksRegLiveness: true
body: |
bb.0:
- liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
; GCN-LABEL: name: V_MFMA_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64___xdl_write_vgpr__cbsz0_blgp0____xdl_read_overlap_vgpr_srcC
- ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec
+ ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec
; GCN-NEXT: S_NOP 7
; GCN-NEXT: S_NOP 7
; GCN-NEXT: S_NOP 1
- ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, killed $vgpr32, 12, 4, implicit $mode, implicit $exec
+ ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, killed $vgpr32, 12, 4, implicit $mode, implicit $exec
; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
- renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec
- renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, killed $vgpr32, 12, 4, implicit $mode, implicit $exec
+ renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec
+ renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, killed $vgpr32, 12, 4, implicit $mode, implicit $exec
S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
...
@@ -180,18 +180,18 @@ name: V_MFMA_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64___xdl_write_vgpr__c
tracksRegLiveness: true
body: |
bb.0:
- liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
; GCN-LABEL: name: V_MFMA_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64___xdl_write_vgpr__cbsz2_blgp2____xdl_read_overlap_vgpr_srcC
- ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec
+ ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec
; GCN-NEXT: S_NOP 7
; GCN-NEXT: S_NOP 1
- ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, killed $vgpr32, 12, 4, implicit $mode, implicit $exec
+ ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, killed $vgpr32, 12, 4, implicit $mode, implicit $exec
; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
- renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec
- renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, killed $vgpr32, 12, 4, implicit $mode, implicit $exec
+ renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec
+ renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, killed $vgpr32, 12, 4, implicit $mode, implicit $exec
S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
...
@@ -202,19 +202,19 @@ name: V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64___xdl_write_v
tracksRegLiveness: true
body: |
bb.0:
- liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
; GCN-LABEL: name: V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64___xdl_write_vgpr__cbsz0_blgp0____xdl_read_overlap_vgpr_srcC
- ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec
+ ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec
; GCN-NEXT: S_NOP 7
; GCN-NEXT: S_NOP 7
; GCN-NEXT: S_NOP 1
- ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec
+ ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec
; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
- renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec
- renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec
+ renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec
+ renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec
S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
...
@@ -225,18 +225,18 @@ name: V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64___xdl_write_v
tracksRegLiveness: true
body: |
bb.0:
- liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
; GCN-LABEL: name: V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64___xdl_write_vgpr__cbsz2_blgp2____xdl_read_overlap_vgpr_srcC
- ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec
+ ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec
; GCN-NEXT: S_NOP 7
; GCN-NEXT: S_NOP 1
- ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec
+ ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec
; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
- renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec
- renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec
+ renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec
+ renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec
S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
...
@@ -247,18 +247,18 @@ name: V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64___xdl_write_
tracksRegLiveness: true
body: |
bb.0:
- liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
; GCN-LABEL: name: V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64___xdl_write_vgpr__cbsz0_blgp0____xdl_read_overlap_vgpr_srcC
- ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, $sgpr4, $vgpr21, 12, 4, implicit $mode, implicit $exec
+ ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, $vgpr33, $vgpr21, 12, 4, implicit $mode, implicit $exec
; GCN-NEXT: S_NOP 7
; GCN-NEXT: S_NOP 3
- ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $sgpr4, killed $vgpr21, 12, 4, implicit $mode, implicit $exec
+ ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $vgpr33, killed $vgpr21, 12, 4, implicit $mode, implicit $exec
; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
- renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, $sgpr4, $vgpr21, 12, 4, implicit $mode, implicit $exec
- renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $sgpr4, killed $vgpr21, 12, 4, implicit $mode, implicit $exec
+ renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, $vgpr33, $vgpr21, 12, 4, implicit $mode, implicit $exec
+ renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $vgpr33, killed $vgpr21, 12, 4, implicit $mode, implicit $exec
S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
...
@@ -269,17 +269,17 @@ name: V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64___xdl_write_
tracksRegLiveness: true
body: |
bb.0:
- liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
; GCN-LABEL: name: V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64___xdl_write_vgpr__cbsz2_blgp2____xdl_read_overlap_vgpr_srcC
- ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 2, 2, $sgpr4, $vgpr21, 12, 4, implicit $mode, implicit $exec
+ ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 2, 2, $vgpr33, $vgpr21, 12, 4, implicit $mode, implicit $exec
; GCN-NEXT: S_NOP 7
- ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $sgpr4, killed $vgpr21, 12, 4, implicit $mode, implicit $exec
+ ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $vgpr33, killed $vgpr21, 12, 4, implicit $mode, implicit $exec
; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
- renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 2, 2, $sgpr4, $vgpr21, 12, 4, implicit $mode, implicit $exec
- renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $sgpr4, killed $vgpr21, 12, 4, implicit $mode, implicit $exec
+ renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 2, 2, $vgpr33, $vgpr21, 12, 4, implicit $mode, implicit $exec
+ renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $vgpr33, killed $vgpr21, 12, 4, implicit $mode, implicit $exec
S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
...
diff --git a/llvm/test/MC/AMDGPU/mai-gfx950-err.s b/llvm/test/MC/AMDGPU/mai-gfx950-err.s
index e700b0b3cabfe..5c9dbd7f7636f 100644
--- a/llvm/test/MC/AMDGPU/mai-gfx950-err.s
+++ b/llvm/test/MC/AMDGPU/mai-gfx950-err.s
@@ -156,3 +156,51 @@ v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[12:19], v[4:9], v[0:3] v20, v21 blgp
v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[12:19], v[4:11], v[0:3] v20, v21 blgp:4
// CHECK: :[[@LINE-1]]:53: error: wrong register tuple size for blgp value 4
+
+
+// Workaround a hardware bug to disallow sgpr/inline constants as scale operands
+
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v44, s24
+// CHECK: :[[@LINE-1]]:77: error: invalid operand for instruction
+
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s24, v44
+// CHECK: :[[@LINE-1]]:72: error: invalid operand for instruction
+
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], m0, v24
+// CHECK: :[[@LINE-1]]:72: error: invalid operand for instruction
+
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], vcc_lo, v24
+// CHECK: :[[@LINE-1]]:72: error: invalid operand for instruction
+
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 9, v24
+// CHECK: :[[@LINE-1]]:72: error: invalid operand for instruction
+
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, 9
+// CHECK: :[[@LINE-1]]:77: error: invalid operand for instruction
+
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 33, v24
+// CHECK: :[[@LINE-1]]:72: error: invalid operand for instruction
+
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 4.0, v24
+// CHECK: :[[@LINE-1]]:72: error: invalid operand for instruction
+
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, 4.0
+// CHECK: :[[@LINE-1]]:77: error: invalid operand for instruction
+
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], -4.0, v24
+// CHECK: :[[@LINE-1]]:72: error: invalid operand for instruction
+
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 0.15915494, v24
+// CHECK: :[[@LINE-1]]:72: error: invalid operand for instruction
+
+v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], 16, v49
+// CHECK: :[[@LINE-1]]:73: error: invalid operand for instruction
+
+v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, -4.0
+// CHECK: :[[@LINE-1]]:78: error: invalid operand for instruction
+
+v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], 4.0, v24
+// CHECK: :[[@LINE-1]]:73: error: invalid operand for instruction
+
+v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], 0.15915494, v24
+// CHECK: :[[@LINE-1]]:73: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/mai-gfx950.s b/llvm/test/MC/AMDGPU/mai-gfx950.s
index 2d3a56703674a..c9035033912ac 100644
--- a/llvm/test/MC/AMDGPU/mai-gfx950.s
+++ b/llvm/test/MC/AMDGPU/mai-gfx950.s
@@ -405,58 +405,6 @@ v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], a[4:11], v[12:19], v[20:23], v24, v25
// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
v_mfma_scale_f32_16x16x128_f8f6f4 v[50:53], v[4:11], v[12:19], v[20:23], v24, v25
-// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v44, s24 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x2c,0x31,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
-// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
-v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v44, s24
-
-// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s24, s24 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x30,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
-// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
-v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s24, s24
-
-// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s24, v44 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x58,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
-// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
-v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s24, v44
-
-// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], m0, m0 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x7c,0xf8,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
-// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
-v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], m0, m0
-
-// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], vcc_lo, v2 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x6a,0x04,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
-// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
-v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], vcc_lo, v2
-
-// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 9, v2 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x89,0x04,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
-// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
-v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 9, v2
-
-// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v2, 9 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x02,0x13,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
-// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
-v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v2, 9
-
-// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s20, 9 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x14,0x12,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
-// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
-v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s20, 9
-
-// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 33, 9 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0xa1,0x12,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
-// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
-v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 33, 9
-
-// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 4.0, v2 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0xf6,0x04,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
-// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
-v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 4.0, v2
-
-// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v2, 4.0 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x02,0xed,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
-// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
-v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v2, 4.0
-
-// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], -4.0, 1.0 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0xf7,0xe4,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
-// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
-v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], -4.0, 1.0
-
-// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 0.15915494, -16 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0xf8,0xa0,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
-// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
-v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 0.15915494, -16
-
// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24]
// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 cbsz:3 blgp:1
@@ -585,22 +533,6 @@ v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:29], v[32:47], v48, v49
// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:29], v[32:47], v48, v49 op_sel:[0,1,0] op_sel_hi:[0,1,0] cbsz:2 blgp:3
-// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], 16, v49 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x90,0x62,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04]
-// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
-v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], 16, v49
-
-// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, -4.0 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0xef,0x01,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04]
-// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
-v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, -4.0
-
-// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], 4.0, 1.0 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0xf6,0xe4,0x01,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04]
-// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
-v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], 4.0, 1.0
-
-// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], 0.15915494, -16 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0xf8,0xa0,0x01,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04]
-// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
-v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], 0.15915494, -16
-
// op_sel combinations
// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel:[0,1,0] op_sel_hi:[0,0,0] ; encoding: [0x00,0x10,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt
index 77b87ac63f335..e191455beb64d 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt
@@ -392,27 +392,6 @@
# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:17], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:1 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x09,0xad,0xd3,0x04,0x19,0x52,0x64]
0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x09,0xad,0xd3,0x04,0x19,0x52,0x64
-# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 33, 9 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0xa1,0x12,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
-0x00,0x00,0xac,0xd3,0xa1,0x12,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04
-
-# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 9, v2 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x89,0x04,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
-0x00,0x00,0xac,0xd3,0x89,0x04,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04
-
-# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], m0, m0 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x7c,0xf8,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
-0x00,0x00,0xac,0xd3,0x7c,0xf8,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04
-
-# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s20, 9 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x14,0x12,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
-0x00,0x00,0xac,0xd3,0x14,0x12,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04
-
-# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s24, s24 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x30,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
-0x00,0x00,0xac,0xd3,0x18,0x30,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04
-
-# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s24, v44 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x58,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
-0x00,0x00,0xac,0xd3,0x18,0x58,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04
-
-# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v2, 9 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x02,0x13,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
-0x00,0x00,0xac,0xd3,0x02,0x13,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04
-
# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04
@@ -422,15 +401,6 @@
# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x24]
0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x24
-# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x09,0xad,0xd3,0x04,0x19,0x52,0x04]
-0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x09,0xad,0xd3,0x04,0x19,0x52,0x04
-
-# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v44, s24 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x2c,0x31,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
-0x00,0x00,0xac,0xd3,0x2c,0x31,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04
-
-# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], vcc_lo, v2 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x6a,0x04,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
-0x00,0x00,0xac,0xd3,0x6a,0x04,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04
-
# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:15], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0c,0xad,0xd3,0x04,0x19,0x52,0x84]
0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0c,0xad,0xd3,0x04,0x19,0x52,0x84
@@ -467,18 +437,6 @@
# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[50:53], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x32,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x32,0x08,0xad,0xd3,0x04,0x19,0x52,0x04
-# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 4.0, v2 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0xf6,0x04,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
-0x00,0x00,0xac,0xd3,0xf6,0x04,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04
-
-# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v2, 4.0 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x02,0xed,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
-0x00,0x00,0xac,0xd3,0x02,0xed,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04
-
-# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], -4.0, 1.0 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0xf7,0xe4,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
-0x00,0x00,0xac,0xd3,0xf7,0xe4,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04
-
-# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 0.15915494, -16 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0xf8,0xa0,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04]
-0x00,0x00,0xac,0xd3,0xf8,0xa0,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04
-
# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:19], a[24:27], a[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:4 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8c,0xae,0xd3,0x10,0x31,0x82,0x9c]
0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8c,0xae,0xd3,0x10,0x31,0x82,0x9c
@@ -581,18 +539,6 @@
# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[50:65], v[16:23], v[24:31], v[32:47], v48, v49 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x32,0x08,0xae,0xd3,0x10,0x31,0x82,0x04]
0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x32,0x08,0xae,0xd3,0x10,0x31,0x82,0x04
-# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], 16, v49 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x90,0x62,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04]
-0x00,0x00,0xac,0xd3,0x90,0x62,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04
-
-# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, -4.0 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0xef,0x01,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04]
-0x00,0x00,0xac,0xd3,0x30,0xef,0x01,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04
-
-# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], 4.0, 1.0 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0xf6,0xe4,0x01,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04]
-0x00,0x00,0xac,0xd3,0xf6,0xe4,0x01,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04
-
-# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], 0.15915494, -16 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0xf8,0xa0,0x01,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04]
-0x00,0x00,0xac,0xd3,0xf8,0xa0,0x01,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04
-
# GFX950: v_mfma_i32_16x16x64_i8 a[0:3], a[0:3], a[0:3], a[0:3] ; encoding: [0x00,0x80,0xb6,0xd3,0x00,0x01,0x02,0x1c]
0x00,0x80,0xb6,0xd3,0x00,0x01,0x02,0x1c
More information about the llvm-commits
mailing list