[llvm] [AMDGPU] Add gfx1250 v_wmma_scale[16]_f32_16x16x128_f8f6f4 MC support (PR #152014)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 4 12:01:16 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Stanislav Mekhanoshin (rampitec)
<details>
<summary>Changes</summary>
This adds the new VOP3PX2e encoding.
---
Patch is 58.90 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/152014.diff
6 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp (+7)
- (modified) llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp (+3-2)
- (modified) llvm/lib/Target/AMDGPU/VOP3PInstructions.td (+73-1)
- (modified) llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll (+41-38)
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s (+272-2)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_wmma_w32.txt (+138)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index ffe6b0649cb94..fef0d7eb45a8c 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -598,6 +598,13 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
// Try to decode DPP and SDWA first to solve conflict with VOP1 and VOP2
// encodings
+ if (isGFX1250() && Bytes.size() >= 16) {
+ DecoderUInt128 DecW = eat16Bytes(Bytes);
+ if (tryDecodeInst(DecoderTableGFX1250128, MI, DecW, Address, CS))
+ break;
+ Bytes = Bytes_.slice(0, MaxInstBytesNum);
+ }
+
if (isGFX11Plus() && Bytes.size() >= 12 ) {
DecoderUInt128 DecW = eat12Bytes(Bytes);
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
index ffdac8b8ce324..fa0c95f54d9e7 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
@@ -75,8 +75,9 @@ unsigned AMDGPUMCAsmInfo::getMaxInstLength(const MCSubtargetInfo *STI) const {
if (STI->hasFeature(AMDGPU::FeatureNSAEncoding))
return 20;
- // VOP3PX encoding.
- if (STI->hasFeature(AMDGPU::FeatureGFX950Insts))
+ // VOP3PX/VOP3PX2 encoding.
+ if (STI->hasFeature(AMDGPU::FeatureGFX950Insts) ||
+ STI->hasFeature(AMDGPU::FeatureGFX1250Insts))
return 16;
// 64-bit instruction with 32-bit literal.
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 457c0eed4f047..c733c3c5235b4 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -1780,6 +1780,8 @@ multiclass WMMA_F8F6F4_Profiles<bit HasMatrixScale, bit Scale16, bit HasMatrixRe
}
defm F32_16X16X128_F8F6F4 : WMMA_F8F6F4_Profiles<0, 0, 0>;
+defm F32_16X16X128_F8F6F4_SCALE : WMMA_F8F6F4_Profiles<1, 0, 1>;
+defm F32_16X16X128_F8F6F4_SCALE16 : WMMA_F8F6F4_Profiles<1, 1, 1>;
class VOP_WMMA_LD_SCALE<ValueType vt, RegisterOperand RC> : VOP3P_Profile<VOPProfile<[untyped, vt, vt, untyped]>> {
let HasMatrixScale = 1;
@@ -1844,7 +1846,8 @@ defm V_SWMMAC_F32_16X16X64_F16_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x64
defm V_SWMMAC_F16_16X16X64_F16_w32 : SWMMACInstGFX12<"v_swmmac_f16_16x16x64_f16", F16_F16X64_SWMMAC_w32, "_w32">;
defm V_WMMA_F32_16X16X128_F8F6F4 : WMMAInst_SrcFormats_mc<"v_wmma_f32_16x16x128_f8f6f4", "F32_16X16X128_F8F6F4">;
-
+defm V_WMMA_SCALE_F32_16X16X128_F8F6F4 : WMMAInst_SrcFormats_mc<"v_wmma_scale_f32_16x16x128_f8f6f4", "F32_16X16X128_F8F6F4_SCALE">;
+defm V_WMMA_SCALE16_F32_16X16X128_F8F6F4 : WMMAInst_SrcFormats_mc<"v_wmma_scale16_f32_16x16x128_f8f6f4", "F32_16X16X128_F8F6F4_SCALE16">;
} // End is_wmma_xdl = 1.
defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3PInst<"v_wmma_ld_scale_paired_b32", VOP_WMMA_LD_SCALE<i32, VCSrc_b32>>;
@@ -2138,6 +2141,73 @@ multiclass VOP3P_Real_WMMA_gfx1250_SrcFormats<bits<8> op, string WMMAP> {
}
}
+class VOP3PX2e <bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile P> : Enc128, VOP3Pe_Base {
+ bits<9> scale_src0;
+ bits<9> scale_src1;
+
+ // Inst{7-0} = unused
+ let Inst{10-8} = {0, matrix_b_scale_fmt{1-0}}; // neg_hi
+ let Inst{11} = matrix_a_scale{0}; // scale_op_sel(0)
+ let Inst{12} = 0; // scale_op_sel(1)
+ let Inst{13} = matrix_a_reuse; // scale_op_sel(2)
+ let Inst{14} = matrix_b_reuse; // scale_op_sel_hi(2)
+ let Inst{15} = 0; // scale_clamp
+ let Inst{31-24} = 0xcc; // Encoding
+ let Inst{23-16} = LdScaleOp;
+ let Inst{40-32} = scale_src0;
+ let Inst{49-41} = scale_src1;
+ let Inst{58-50} = 0; // scale src2
+ let Inst{59} = matrix_b_scale{0}; // scale_op_sel_hi(0)
+ let Inst{60} = 0; // scale_op_sel_hi(1)
+ let Inst{63-61} = {0, matrix_a_scale_fmt{1-0}}; // neg (lo)
+
+ // The high half of the encoding is the unscaled wmma op.
+ let Inst{71-64} = vdst;
+
+ let Inst{72} = !if(P.NegHi01, src0_modifiers{1}, 0); // neg_hi src0
+ let Inst{73} = !if(P.NegHi01, src1_modifiers{1}, 0); // neg_hi src1
+ let Inst{74} = !if(P.NegHi2, src2_modifiers{1}, 0); // neg_hi src2
+
+ let Inst{77-75} = !if(P.HasMatrixFMT, matrix_a_fmt{2-0}, 0); // op_sel
+
+ let Inst{78,124,123} = !if(P.HasMatrixFMT, matrix_b_fmt{2-0}, 7); // op_sel_hi
+ let Inst{79} = !if(P.HasClamp, clamp{0}, 0);
+
+ let Inst{87-80} = op;
+ let Inst{95-88} = 0xcc; //encoding
+ let Inst{104-96} = !if(P.HasSrc0, src0, 0);
+ let Inst{113-105} = !if(P.HasSrc1, src1, 0);
+ let Inst{122-114} = !if(P.HasSrc2, src2, 0);
+
+ // neg_lo
+ let Inst{125} = !if(P.NegLo01, src0_modifiers{0}, 0);
+ let Inst{126} = !if(P.NegLo01, src1_modifiers{0}, 0);
+ let Inst{127} = !if(P.NegLo2, src2_modifiers{0}, 0);
+}
+
+multiclass VOP3PX2_Real_ScaledWMMA<bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile WMMAP> {
+ defvar PS = !cast<VOP3P_Pseudo>(NAME # "_twoaddr");
+ defvar asmName = !substr(PS.Mnemonic, 0, !sub(!size(PS.Mnemonic), !size("_f8_f8_w32")));
+ defvar psName = !substr(NAME, 0, !sub(!size(PS.Mnemonic), !size("_f8_f8_w32")));
+ let SubtargetPredicate = isGFX1250Plus, WaveSizePredicate = isWave32,
+ DecoderNamespace = "GFX1250" in {
+ def _gfx1250 : VOP3P_Real_Gen<PS, GFX1250Gen, asmName>,
+ VOP3PX2e <op, LdScaleOp, WMMAP>,
+ MFMA_F8F6F4_WithSizeTable_Helper<PS, psName # "_f8_f8_w32_gfx1250"> {
+ let AsmString = asmName # PS.AsmOperands;
+ }
+ }
+}
+
+multiclass VOP3PX2_Real_ScaledWMMA_SrcFormats<bits<8> op, bits<8> LdScaleOp, string WMMAP> {
+ defm _f8_f8_w32 : VOP3PX2_Real_ScaledWMMA<op, LdScaleOp, !cast<VOP3PWMMA_Profile>(WMMAP # "_f8_f8_w32")>;
+ foreach I = ["f8_f6", "f8_f4", "f6_f8", "f6_f6", "f6_f4", "f4_f8", "f4_f6", "f4_f4"] in {
+ let isAsmParserOnly = true in { // Disable ambiguous disassembly.
+ defm _#I#_w32 : VOP3PX2_Real_ScaledWMMA<op, LdScaleOp, !cast<VOP3PWMMA_Profile>(WMMAP # "_" # I # "_w32")>;
+ }
+ }
+}
+
defm V_WMMA_F32_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x040, F32_F16_WMMA_w32>;
defm V_WMMA_F32_16X16X16_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x041, F32_BF16_WMMA_w32>;
defm V_WMMA_F16_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x042, F16_F16_WMMA_w32>;
@@ -2213,6 +2283,8 @@ defm V_WMMA_F16_16X16X128_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x087, F16_FP8B
defm V_WMMA_F32_32X16X128_F4_w32 : VOP3P_Real_WMMA_gfx1250 <0x088, F32_32X16X128_F4_WMMA_w32>;
defm V_WMMA_F32_16X16X128_F8F6F4 : VOP3P_Real_WMMA_gfx1250_SrcFormats<0x033, "F32_16X16X128_F8F6F4">;
+defm V_WMMA_SCALE_F32_16X16X128_F8F6F4 : VOP3PX2_Real_ScaledWMMA_SrcFormats<0x033, 0x35, "F32_16X16X128_F8F6F4_SCALE">;
+defm V_WMMA_SCALE16_F32_16X16X128_F8F6F4 : VOP3PX2_Real_ScaledWMMA_SrcFormats<0x033, 0x3a, "F32_16X16X128_F8F6F4_SCALE16">;
defm V_SWMMAC_F32_16X16X64_F16_w32 : VOP3P_Real_WMMA_gfx1250 <0x065, F32_F16X64_SWMMAC_w32>;
defm V_SWMMAC_F32_16X16X64_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x066, F32_BF16X64_SWMMAC_w32>;
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
index 2ad7818bd3ca8..243f0ed3a8d0d 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
@@ -25,8 +25,11 @@ define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(ptr addr
; GCN-NEXT: s_load_b32 s0, s[4:5], 0x2c
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: s_cmp_eq_u32 s0, 0
-; GCN-NEXT: s_cbranch_scc1 .LBB0_2
-; GCN-NEXT: ; %bb.1: ; %bb2
+; GCN-NEXT: s_cbranch_scc0 .LBB0_1
+; GCN-NEXT: ; %bb.3: ; %bb
+; GCN-NEXT: s_add_pc_i64 .LBB0_2-.Lpost_addpc0
+; GCN-NEXT: .Lpost_addpc0:
+; GCN-NEXT: .LBB0_1: ; %bb2
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: v_nop_e64
; GCN-NEXT: v_nop_e64
@@ -64,8 +67,8 @@ define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(ptr addrs
; GCN-NEXT: s_cmp_eq_u32 s0, 0
; GCN-NEXT: s_cbranch_scc0 .LBB1_1
; GCN-NEXT: ; %bb.3: ; %bb0
-; GCN-NEXT: s_add_pc_i64 .LBB1_2-.Lpost_addpc0
-; GCN-NEXT: .Lpost_addpc0:
+; GCN-NEXT: s_add_pc_i64 .LBB1_2-.Lpost_addpc1
+; GCN-NEXT: .Lpost_addpc1:
; GCN-NEXT: .LBB1_1: ; %bb2
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: v_nop_e64
@@ -106,8 +109,8 @@ define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(ptr
; GCN-NEXT: s_cmp_eq_f32 s0, 0
; GCN-NEXT: s_cbranch_scc0 .LBB2_1
; GCN-NEXT: ; %bb.3: ; %bb0
-; GCN-NEXT: s_add_pc_i64 .LBB2_2-.Lpost_addpc1
-; GCN-NEXT: .Lpost_addpc1:
+; GCN-NEXT: s_add_pc_i64 .LBB2_2-.Lpost_addpc2
+; GCN-NEXT: .Lpost_addpc2:
; GCN-NEXT: .LBB2_1: ; %bb2
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; 32 bytes
@@ -157,8 +160,8 @@ define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 {
; GCN-NEXT: v_cmpx_ne_u32_e32 0, v2
; GCN-NEXT: s_cbranch_execnz .LBB3_1
; GCN-NEXT: ; %bb.3: ; %bb
-; GCN-NEXT: s_add_pc_i64 .LBB3_2-.Lpost_addpc2
-; GCN-NEXT: .Lpost_addpc2:
+; GCN-NEXT: s_add_pc_i64 .LBB3_2-.Lpost_addpc3
+; GCN-NEXT: .Lpost_addpc3:
; GCN-NEXT: .LBB3_1: ; %bb2
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; 32 bytes
@@ -209,8 +212,8 @@ define amdgpu_kernel void @long_backward_sbranch(ptr addrspace(1) %arg) #0 {
; GCN-NEXT: s_cbranch_scc0 .LBB4_2
; GCN-NEXT: ; %bb.3: ; %bb2
; GCN-NEXT: ; in Loop: Header=BB4_1 Depth=1
-; GCN-NEXT: s_add_pc_i64 .LBB4_1-.Lpost_addpc3
-; GCN-NEXT: .Lpost_addpc3:
+; GCN-NEXT: s_add_pc_i64 .LBB4_1-.Lpost_addpc4
+; GCN-NEXT: .Lpost_addpc4:
; GCN-NEXT: .LBB4_2: ; %bb3
; GCN-NEXT: s_endpgm
bb:
@@ -242,8 +245,8 @@ define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr add
; GCN-NEXT: s_mov_b32 s0, -1
; GCN-NEXT: s_cbranch_scc0 .LBB5_1
; GCN-NEXT: ; %bb.7: ; %bb0
-; GCN-NEXT: s_add_pc_i64 .LBB5_4-.Lpost_addpc5
-; GCN-NEXT: .Lpost_addpc5:
+; GCN-NEXT: s_add_pc_i64 .LBB5_4-.Lpost_addpc6
+; GCN-NEXT: .Lpost_addpc6:
; GCN-NEXT: .LBB5_1: ; %Flow
; GCN-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
; GCN-NEXT: s_cbranch_vccnz .LBB5_3
@@ -268,11 +271,11 @@ define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr add
; GCN-NEXT: s_sleep 0
; GCN-NEXT: s_cbranch_execnz .LBB5_5
; GCN-NEXT: ; %bb.9: ; %bb3
-; GCN-NEXT: s_add_pc_i64 .LBB5_2-.Lpost_addpc6
-; GCN-NEXT: .Lpost_addpc6:
+; GCN-NEXT: s_add_pc_i64 .LBB5_2-.Lpost_addpc7
+; GCN-NEXT: .Lpost_addpc7:
; GCN-NEXT: .LBB5_5: ; %bb3
-; GCN-NEXT: s_add_pc_i64 .LBB5_3-.Lpost_addpc4
-; GCN-NEXT: .Lpost_addpc4:
+; GCN-NEXT: s_add_pc_i64 .LBB5_3-.Lpost_addpc5
+; GCN-NEXT: .Lpost_addpc5:
bb0:
%tmp = icmp ne i32 %arg1, 0
br i1 %tmp, label %bb2, label %bb3
@@ -310,8 +313,8 @@ define amdgpu_kernel void @uniform_unconditional_min_long_backward_branch(ptr ad
; GCN-NEXT: s_cbranch_vccz .LBB6_2
; GCN-NEXT: ; %bb.3: ; %loop
; GCN-NEXT: ; in Loop: Header=BB6_1 Depth=1
-; GCN-NEXT: s_add_pc_i64 .LBB6_1-.Lpost_addpc7
-; GCN-NEXT: .Lpost_addpc7:
+; GCN-NEXT: s_add_pc_i64 .LBB6_1-.Lpost_addpc8
+; GCN-NEXT: .Lpost_addpc8:
; GCN-NEXT: .LBB6_2: ; %DummyReturnBlock
; GCN-NEXT: s_endpgm
entry:
@@ -350,8 +353,8 @@ define amdgpu_kernel void @expand_requires_expand(i32 %cond0) #0 {
; GCN-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
; GCN-NEXT: s_cbranch_vccz .LBB7_3
; GCN-NEXT: ; %bb.5: ; %Flow
-; GCN-NEXT: s_add_pc_i64 .LBB7_4-.Lpost_addpc8
-; GCN-NEXT: .Lpost_addpc8:
+; GCN-NEXT: s_add_pc_i64 .LBB7_4-.Lpost_addpc9
+; GCN-NEXT: .Lpost_addpc9:
; GCN-NEXT: .LBB7_3: ; %bb2
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: v_nop_e64
@@ -406,8 +409,8 @@ define amdgpu_kernel void @uniform_inside_divergent(ptr addrspace(1) %out, i32 %
; GCN-NEXT: v_cmpx_gt_u32_e32 16, v0
; GCN-NEXT: s_cbranch_execnz .LBB8_1
; GCN-NEXT: ; %bb.4: ; %entry
-; GCN-NEXT: s_add_pc_i64 .LBB8_3-.Lpost_addpc9
-; GCN-NEXT: .Lpost_addpc9:
+; GCN-NEXT: s_add_pc_i64 .LBB8_3-.Lpost_addpc10
+; GCN-NEXT: .Lpost_addpc10:
; GCN-NEXT: .LBB8_1: ; %if
; GCN-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
@@ -465,8 +468,8 @@ define amdgpu_kernel void @analyze_mask_branch() #0 {
; GCN-NEXT: s_and_not1_saveexec_b32 s0, s0
; GCN-NEXT: s_cbranch_execnz .LBB9_3
; GCN-NEXT: ; %bb.6: ; %Flow1
-; GCN-NEXT: s_add_pc_i64 .LBB9_5-.Lpost_addpc10
-; GCN-NEXT: .Lpost_addpc10:
+; GCN-NEXT: s_add_pc_i64 .LBB9_5-.Lpost_addpc11
+; GCN-NEXT: .Lpost_addpc11:
; GCN-NEXT: .LBB9_3: ; %loop.preheader
; GCN-NEXT: s_mov_b32 vcc_lo, 0
; GCN-NEXT: .LBB9_4: ; %loop
@@ -484,8 +487,8 @@ define amdgpu_kernel void @analyze_mask_branch() #0 {
; GCN-NEXT: s_cbranch_vccnz .LBB9_5
; GCN-NEXT: ; %bb.8: ; %loop
; GCN-NEXT: ; in Loop: Header=BB9_4 Depth=1
-; GCN-NEXT: s_add_pc_i64 .LBB9_4-.Lpost_addpc11
-; GCN-NEXT: .Lpost_addpc11:
+; GCN-NEXT: s_add_pc_i64 .LBB9_4-.Lpost_addpc12
+; GCN-NEXT: .Lpost_addpc12:
; GCN-NEXT: .LBB9_5: ; %UnifiedReturnBlock
; GCN-NEXT: s_endpgm
entry:
@@ -528,20 +531,20 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
; GCN-NEXT: s_cmp_lt_i32 s3, 6
; GCN-NEXT: s_cbranch_scc0 .LBB10_1
; GCN-NEXT: ; %bb.10: ; %bb
-; GCN-NEXT: s_add_pc_i64 .LBB10_4-.Lpost_addpc13
-; GCN-NEXT: .Lpost_addpc13:
+; GCN-NEXT: s_add_pc_i64 .LBB10_4-.Lpost_addpc14
+; GCN-NEXT: .Lpost_addpc14:
; GCN-NEXT: .LBB10_1: ; %Flow
; GCN-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s7
; GCN-NEXT: s_cbranch_vccnz .LBB10_2
; GCN-NEXT: ; %bb.12: ; %Flow
-; GCN-NEXT: s_add_pc_i64 .LBB10_5-.Lpost_addpc14
-; GCN-NEXT: .Lpost_addpc14:
+; GCN-NEXT: s_add_pc_i64 .LBB10_5-.Lpost_addpc15
+; GCN-NEXT: .Lpost_addpc15:
; GCN-NEXT: .LBB10_2: ; %Flow5
; GCN-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
; GCN-NEXT: s_cbranch_vccz .LBB10_3
; GCN-NEXT: ; %bb.14: ; %Flow5
-; GCN-NEXT: s_add_pc_i64 .LBB10_6-.Lpost_addpc15
-; GCN-NEXT: .Lpost_addpc15:
+; GCN-NEXT: s_add_pc_i64 .LBB10_6-.Lpost_addpc16
+; GCN-NEXT: .Lpost_addpc16:
; GCN-NEXT: .LBB10_3: ; %bb14
; GCN-NEXT: s_cmp_lt_i32 s1, 9
; GCN-NEXT: s_cselect_b32 s0, -1, 0
@@ -553,8 +556,8 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GCN-NEXT: ; %bb.8: ; %bb14
-; GCN-NEXT: s_add_pc_i64 .LBB10_7-.Lpost_addpc12
-; GCN-NEXT: .Lpost_addpc12:
+; GCN-NEXT: s_add_pc_i64 .LBB10_7-.Lpost_addpc13
+; GCN-NEXT: .Lpost_addpc13:
; GCN-NEXT: .LBB10_4: ; %bb13
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: v_nop_e64
@@ -565,8 +568,8 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
; GCN-NEXT: s_sleep 0
; GCN-NEXT: s_cbranch_execz .LBB10_5
; GCN-NEXT: ; %bb.16: ; %bb13
-; GCN-NEXT: s_add_pc_i64 .LBB10_2-.Lpost_addpc16
-; GCN-NEXT: .Lpost_addpc16:
+; GCN-NEXT: s_add_pc_i64 .LBB10_2-.Lpost_addpc17
+; GCN-NEXT: .Lpost_addpc17:
; GCN-NEXT: .LBB10_5: ; %bb9
; GCN-NEXT: s_cmp_lt_i32 s3, 11
; GCN-NEXT: s_cselect_b32 s0, -1, 0
@@ -577,8 +580,8 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
; GCN-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
; GCN-NEXT: s_cbranch_vccnz .LBB10_6
; GCN-NEXT: ; %bb.18: ; %bb9
-; GCN-NEXT: s_add_pc_i64 .LBB10_3-.Lpost_addpc17
-; GCN-NEXT: .Lpost_addpc17:
+; GCN-NEXT: s_add_pc_i64 .LBB10_3-.Lpost_addpc18
+; GCN-NEXT: .Lpost_addpc18:
; GCN-NEXT: .LBB10_6:
; GCN-NEXT: ; implicit-def: $vgpr0
; GCN-NEXT: .LBB10_7: ; %bb19
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s b/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s
index 309c74ae7ff7a..93e65d3444b89 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s
@@ -983,6 +983,11 @@ v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_lo:[0,0,1]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_hi:[0,0,1]
+// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_hi:[0,0,1] ; encoding: [0x00,0x04,0x33,0xcc,0x08,0x31,0xa2,0x04]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
v_wmma_ld_scale_paired_b32 v1, v2
// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
@@ -1153,8 +1158,273 @@ v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 mat
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
-v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_hi:[0,0,1]
-// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_hi:[0,0,1] ; encoding: [0x00,0x04,0x33,0xcc,0x08,0x31,0xa2,0x04]
+v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v1, v2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1]
+// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v1, v2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], s1, s2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1]
+// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], s1, s2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x04,0x00,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0
+// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_FP8
+// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_BF8
+// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x08,0x33,0xcc,0x00,0x01,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_FP6
+// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x10,0x33,0xcc,0x00,0x01,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: er...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/152014
More information about the llvm-commits mailing list