[llvm] 57216f7 - [AMDGPU] Support byte_sel modifier for v_cvt_f32_fp8 and v_cvt_f32_bf8 (#90887)
via llvm-commits
llvm-commits at lists.llvm.org
Thu May 2 12:03:56 PDT 2024
Author: Stanislav Mekhanoshin
Date: 2024-05-02T12:03:51-07:00
New Revision: 57216f7bd64a78af6e037b34662ed49a6221734b
URL: https://github.com/llvm/llvm-project/commit/57216f7bd64a78af6e037b34662ed49a6221734b
DIFF: https://github.com/llvm/llvm-project/commit/57216f7bd64a78af6e037b34662ed49a6221734b.diff
LOG: [AMDGPU] Support byte_sel modifier for v_cvt_f32_fp8 and v_cvt_f32_bf8 (#90887)
Added:
Modified:
llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
llvm/lib/Target/AMDGPU/SIInstrInfo.td
llvm/lib/Target/AMDGPU/VOP1Instructions.td
llvm/lib/Target/AMDGPU/VOPInstructions.td
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s
llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s
llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s
llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt
llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 4d036fdea63b4d..442a9c12b42f24 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -8634,8 +8634,8 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands,
}
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::byte_sel)) {
- assert(AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in));
- Inst.addOperand(Inst.getOperand(0));
+ if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in))
+ Inst.addOperand(Inst.getOperand(0));
addOptionalImmOperand(Inst, Operands, OptionalIdx,
AMDGPUOperand::ImmTyByteSel);
}
diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
index 91733c2933b4c9..0ac079c69e605f 100644
--- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -140,7 +140,8 @@ bool GCNDPPCombine::isShrinkable(MachineInstr &MI) const {
if (!hasNoImmOrEqual(MI, AMDGPU::OpName::src0_modifiers, 0, Mask) ||
!hasNoImmOrEqual(MI, AMDGPU::OpName::src1_modifiers, 0, Mask) ||
!hasNoImmOrEqual(MI, AMDGPU::OpName::clamp, 0) ||
- !hasNoImmOrEqual(MI, AMDGPU::OpName::omod, 0)) {
+ !hasNoImmOrEqual(MI, AMDGPU::OpName::omod, 0) ||
+ !hasNoImmOrEqual(MI, AMDGPU::OpName::byte_sel, 0)) {
LLVM_DEBUG(dbgs() << " Inst has non-default modifiers\n");
return false;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 76b90042d65f7d..6599d0abd135cd 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4459,7 +4459,8 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI,
// Check output modifiers
return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
- !hasModifiersSet(MI, AMDGPU::OpName::clamp);
+ !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
+ !hasModifiersSet(MI, AMDGPU::OpName::byte_sel);
}
// Set VCC operand with all flags from \p Orig, except for setting it as
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 7a8b6c98fc3657..6c16811b7dd64c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2306,8 +2306,9 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
field bit IsWMMA = 0;
field bit IsSWMMAC = 0;
- field bit IsFP8 = 0;
+ field bit IsFP8SrcByteSel = 0;
field bit IsFP8DstByteSel = 0;
+ field bit IsFP8ByteSel = !or(IsFP8SrcByteSel, IsFP8DstByteSel);
field bit HasDst = !ne(DstVT.Value, untyped.Value);
field bit HasDst32 = HasDst;
@@ -2427,7 +2428,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
field string AsmDPP8 = getAsmDPP8<HasDst, NumSrcArgs, 0 /*HasModifiers*/, DstVT>.ret;
field string AsmVOP3Base = getAsmVOP3Base<NumSrcArgs, HasDst, HasClamp,
HasOpSel, HasOMod, IsVOP3P, HasModifiers, HasModifiers, HasModifiers,
- HasModifiers, DstVT, IsFP8DstByteSel>.ret;
+ HasModifiers, DstVT, IsFP8ByteSel>.ret;
field string Asm64 = AsmVOP3Base;
field string AsmVOP3P = getAsmVOP3P<NumSrcArgs, HasModifiers, HasClamp, HasOpSel>.ret;
field string AsmVOP3OpSel = getAsmVOP3OpSel<NumSrcArgs,
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 2341e0d9d32bb4..0efb95d7d15a11 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -625,42 +625,44 @@ def VOPProfile_Base_CVT_PK_F32_F8_OpSel : VOPProfileI2F <v2f32, i32> {
let HasExtVOP3DPP = 0;
}
-def VOPProfile_Base_CVT_F32_F8_OpSel : VOPProfile<[f32, i32, untyped, untyped]> {
- let HasOpSel = 1;
+class VOPProfile_Base_CVT_F_F8_ByteSel<ValueType DstVT> : VOPProfile<[DstVT, i32, untyped, untyped]> {
+ let IsFP8SrcByteSel = 1;
+ let HasOpSel = 0;
let HasExtDPP = 1;
let HasExtVOP3DPP = 1;
- let IsFP8 = 1;
+ let HasExtSDWA = 0;
let HasClamp = 0;
let HasOMod = 0;
- let HasModifiers = 1;
- let Src1VOP3DPP = Src1RC64;
+ let HasModifiers = 0;
+
+ defvar bytesel = (ins ByteSel:$byte_sel);
+ let Ins64 = !con(getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
+ HasClamp, HasModifiers, HasSrc2Mods,
+ HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret,
+ bytesel);
+ let InsVOP3Base = !con(getInsVOP3Base<Src0VOP3DPP, Src1VOP3DPP, Src2VOP3DPP,
+ NumSrcArgs, HasClamp, HasModifiers, HasSrc2Mods,
+ HasOMod, Src0ModVOP3DPP, Src1ModVOP3DPP,
+ Src2ModVOP3DPP, HasOpSel>.ret,
+ bytesel);
}
let SubtargetPredicate = isGFX12Plus, OtherPredicates = [HasFP8ConversionInsts],
mayRaiseFPException = 0, SchedRW = [WriteFloatCvt] in {
- defm V_CVT_F32_FP8_OP_SEL : VOP1Inst<"v_cvt_f32_fp8_op_sel", VOPProfile_Base_CVT_F32_F8_OpSel>;
- defm V_CVT_F32_BF8_OP_SEL : VOP1Inst<"v_cvt_f32_bf8_op_sel", VOPProfile_Base_CVT_F32_F8_OpSel>;
+ defm V_CVT_F32_FP8_OP_SEL : VOP1Inst<"v_cvt_f32_fp8_op_sel", VOPProfile_Base_CVT_F_F8_ByteSel<f32>>;
+ defm V_CVT_F32_BF8_OP_SEL : VOP1Inst<"v_cvt_f32_bf8_op_sel", VOPProfile_Base_CVT_F_F8_ByteSel<f32>>;
defm V_CVT_PK_F32_FP8_OP_SEL : VOP1Inst<"v_cvt_pk_f32_fp8_op_sel", VOPProfile_Base_CVT_PK_F32_F8_OpSel>;
defm V_CVT_PK_F32_BF8_OP_SEL : VOP1Inst<"v_cvt_pk_f32_bf8_op_sel", VOPProfile_Base_CVT_PK_F32_F8_OpSel>;
}
-class Cvt_F32_F8_Pat_OpSel<SDPatternOperator node, bits<2> index,
- VOP1_Pseudo inst_e32, VOP3_Pseudo inst_e64> : GCNPat<
- (f32 (node i32:$src, index)),
- !if (index,
- (inst_e64 !or(!if(index{0}, SRCMODS.OP_SEL_1, 0),
- !if(index{1}, SRCMODS.OP_SEL_0, 0)),
- $src, 0),
- (inst_e32 $src))
+class Cvt_F_F8_Pat_ByteSel<SDPatternOperator node, VOP3_Pseudo inst> : GCNPat<
+ (node i32:$src0, timm:$byte_sel),
+ (inst $src0, (as_i32timm $byte_sel))
>;
let SubtargetPredicate = isGFX12Plus, OtherPredicates = [HasFP8ConversionInsts] in {
- foreach Index = [0, 1, 2, 3] in {
- def : Cvt_F32_F8_Pat_OpSel<int_amdgcn_cvt_f32_fp8, Index,
- V_CVT_F32_FP8_e32, V_CVT_F32_FP8_OP_SEL_e64>;
- def : Cvt_F32_F8_Pat_OpSel<int_amdgcn_cvt_f32_bf8, Index,
- V_CVT_F32_BF8_e32, V_CVT_F32_BF8_OP_SEL_e64>;
- }
+ def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f32_fp8, V_CVT_F32_FP8_OP_SEL_e64>;
+ def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f32_bf8, V_CVT_F32_BF8_OP_SEL_e64>;
}
class Cvt_PK_F32_F8_Pat_OpSel<SDPatternOperator node, int index,
@@ -901,14 +903,11 @@ multiclass VOP1_Real_NO_DPP_OP_SEL_with_name<GFXGen Gen, bits<9> op,
VOP3_Real_with_name<Gen, {0, 1, 1, op{6-0}}, opName, asmName>;
+defm V_CVT_F32_FP8 : VOP1_Real_FULL_with_name<GFX12Gen, 0x06c, "V_CVT_F32_FP8_OP_SEL", "v_cvt_f32_fp8">;
+defm V_CVT_F32_BF8 : VOP1_Real_FULL_with_name<GFX12Gen, 0x06d, "V_CVT_F32_BF8_OP_SEL", "v_cvt_f32_bf8">;
+
// Define VOP1 instructions using the pseudo instruction with its old profile and
// VOP3 using the OpSel profile for the pseudo instruction.
-defm V_CVT_F32_FP8 : VOP1_Real_NO_VOP3_with_name_gfx12<0x06c, "V_CVT_F32_FP8", "v_cvt_f32_fp8">;
-defm V_CVT_F32_FP8 : VOP1_Realtriple_e64_with_name<GFX12Gen, 0x06c, "V_CVT_F32_FP8_OP_SEL", "v_cvt_f32_fp8">;
-
-defm V_CVT_F32_BF8 : VOP1_Real_NO_VOP3_with_name_gfx12<0x06d, "V_CVT_F32_BF8", "v_cvt_f32_bf8">;
-defm V_CVT_F32_BF8 : VOP1_Realtriple_e64_with_name<GFX12Gen, 0x06d, "V_CVT_F32_BF8_OP_SEL", "v_cvt_f32_bf8">;
-
defm V_CVT_PK_F32_FP8 : VOP1_Real_e32_with_name<GFX12Gen, 0x06e, "V_CVT_PK_F32_FP8", "v_cvt_pk_f32_fp8">;
defm V_CVT_PK_F32_FP8 : VOP3_Real_with_name<GFX12Gen, 0x1ee, "V_CVT_PK_F32_FP8_OP_SEL", "v_cvt_pk_f32_fp8">;
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 7cdb5cbfe297d5..74988ac634c31e 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -306,9 +306,10 @@ class VOP3OpSel_gfx10<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
class VOP3OpSel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3OpSel_gfx10<op, p>;
-class VOP3FP8OpSel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
- let Inst{11} = !if(p.HasSrc0, src0_modifiers{2}, 0);
- let Inst{12} = !if(p.HasSrc0, src0_modifiers{3}, 0);
+class VOP3FP8OpSel_src_bytesel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
+ bits<2> byte_sel;
+ let Inst{11-12} = byte_sel; // NB: bit order is intentionally reversed!
+ let Inst{14-13} = 0; // op_sel2/3
}
class VOP3FP8OpSel_dst_bytesel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
@@ -755,10 +756,14 @@ class VOP3_DPPe_Common_Base<bits<10> op, VOPProfile P> : Enc96 {
let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0);
let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0);
// OPSEL must be set such that the low result only uses low inputs, and the high result only uses high inputs.
- let Inst{11} = !if(P.HasOpSel,!if(P.HasSrc0Mods, src0_modifiers{2}, 0),?);
- let Inst{12} = !if(P.HasOpSel,!if(P.HasSrc1Mods, src1_modifiers{2}, !if((P.IsFP8), src0_modifiers{3}, 0)), ?);
- let Inst{13} = !if(P.HasOpSel,!if(P.HasSrc2Mods, src2_modifiers{2}, 0),!if(P.IsFP8DstByteSel, byte_sel{0}, ?));
- let Inst{14} = !if(P.HasOpSel,!if(P.HasSrc0Mods, src0_modifiers{3}, 0),!if(P.IsFP8DstByteSel, byte_sel{1}, ?));
+ let Inst{11} = !if(P.HasOpSel, !if(P.HasSrc0Mods, src0_modifiers{2}, 0),
+ !if(P.IsFP8SrcByteSel, byte_sel{1}, ?));
+ let Inst{12} = !if(P.HasOpSel, !if(P.HasSrc1Mods, src1_modifiers{2}, 0),
+ !if(P.IsFP8SrcByteSel, byte_sel{0}, ?));
+ let Inst{13} = !if(P.HasOpSel, !if(P.HasSrc2Mods, src2_modifiers{2}, 0),
+ !if(P.IsFP8DstByteSel, byte_sel{0}, ?));
+ let Inst{14} = !if(P.HasOpSel, !if(P.HasSrc0Mods, src0_modifiers{3}, 0),
+ !if(P.IsFP8DstByteSel, byte_sel{1}, ?));
let Inst{15} = !if(P.HasClamp, clamp, 0);
let Inst{25-16} = op;
let Inst{31-26} = 0x35;
@@ -1397,7 +1402,11 @@ multiclass VOP3_Real_Base<GFXGen Gen, bits<10> op, string opName = NAME,
bit isSingle = 0> {
defvar ps = !cast<VOP_Pseudo>(opName#"_e64");
let IsSingle = !or(isSingle, ps.Pfl.IsSingle) in {
- if ps.Pfl.IsFP8DstByteSel then {
+ if ps.Pfl.IsFP8SrcByteSel then {
+ def _e64#Gen.Suffix :
+ VOP3_Real_Gen<ps, Gen>,
+ VOP3FP8OpSel_src_bytesel_gfx11_gfx12<op, ps.Pfl>;
+ } else if ps.Pfl.IsFP8DstByteSel then {
def _e64#Gen.Suffix :
VOP3_Real_Gen<ps, Gen>,
VOP3FP8OpSel_dst_bytesel_gfx11_gfx12<op, ps.Pfl>;
@@ -1428,10 +1437,10 @@ multiclass VOP3_Real_with_name<GFXGen Gen, bits<10> op, string opName,
defvar ps = !cast<VOP_Pseudo>(opName#"_e64");
let AsmString = asmName # ps.AsmOperands,
IsSingle = !or(isSingle, ps.Pfl.IsSingle) in {
- if ps.Pfl.IsFP8 then {
+ if ps.Pfl.IsFP8SrcByteSel then {
def _e64#Gen.Suffix :
VOP3_Real_Gen<ps, Gen>,
- VOP3FP8OpSel_gfx11_gfx12<op, ps.Pfl>;
+ VOP3FP8OpSel_src_bytesel_gfx11_gfx12<op, ps.Pfl>;
} else if ps.Pfl.IsFP8DstByteSel then {
def _e64#Gen.Suffix :
VOP3_Real_Gen<ps, Gen>,
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll
index ffedde9416bb26..1139271003119c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll
@@ -14,9 +14,7 @@ define amdgpu_cs float @test_cvt_f32_bf8_byte0(i32 %a) {
define amdgpu_cs float @test_cvt_f32_bf8_byte1(i32 %a) {
; GFX12-LABEL: test_cvt_f32_bf8_byte1:
; GFX12: ; %bb.0:
-; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[0,1]
+; GFX12-NEXT: v_cvt_f32_bf8_e64_dpp v0, v0 byte_sel:1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX12-NEXT: ; return to shader part epilog
%tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1)
%ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %tmp0, i32 1)
@@ -26,9 +24,7 @@ define amdgpu_cs float @test_cvt_f32_bf8_byte1(i32 %a) {
define amdgpu_cs float @test_cvt_f32_bf8_byte2(i32 %a) {
; GFX12-LABEL: test_cvt_f32_bf8_byte2:
; GFX12: ; %bb.0:
-; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[1,0]
+; GFX12-NEXT: v_cvt_f32_bf8_e64_dpp v0, v0 byte_sel:2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX12-NEXT: ; return to shader part epilog
%tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1)
%ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %tmp0, i32 2)
@@ -38,9 +34,7 @@ define amdgpu_cs float @test_cvt_f32_bf8_byte2(i32 %a) {
define amdgpu_cs float @test_cvt_f32_fp8_byte3(i32 %a) {
; GFX12-LABEL: test_cvt_f32_fp8_byte3:
; GFX12: ; %bb.0:
-; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 op_sel:[1,1]
+; GFX12-NEXT: v_cvt_f32_fp8_e64_dpp v0, v0 byte_sel:3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX12-NEXT: ; return to shader part epilog
%tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1)
%ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %tmp0, i32 3)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir
index d11fb27640ee75..7e286a4dd678eb 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir
@@ -13,12 +13,12 @@ body: |
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
- ; GFX12-NEXT: [[V_CVT_F32_BF8_dpp:%[0-9]+]]:vgpr_32 = V_CVT_F32_BF8_dpp [[DEF]], [[COPY]], 228, 15, 15, 1, implicit $mode, implicit $exec
- ; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_BF8_dpp]]
+ ; GFX12-NEXT: [[V_CVT_F32_BF8_OP_SEL_dpp:%[0-9]+]]:vgpr_32 = V_CVT_F32_BF8_OP_SEL_dpp [[DEF]], [[COPY]], 228, 15, 15, 1, implicit $mode, implicit $exec
+ ; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_BF8_OP_SEL_dpp]]
; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0
%0:vgpr_32 = COPY $vgpr0
%1:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec
- %2:vgpr_32 = V_CVT_F32_BF8_e32 killed %1, implicit $mode, implicit $exec
+ %2:vgpr_32 = V_CVT_F32_BF8_OP_SEL_e32 killed %1, implicit $mode, implicit $exec
$vgpr0 = COPY %2
SI_RETURN_TO_EPILOG $vgpr0
@@ -34,13 +34,13 @@ body: |
; GFX12: liveins: $vgpr0
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY]], [[COPY]], 228, 15, 15, -1, implicit $exec
- ; GFX12-NEXT: [[V_CVT_F32_BF8_OP_SEL_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_BF8_OP_SEL_e64 8, killed [[V_MOV_B32_dpp]], 0, implicit $mode, implicit $exec
- ; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_BF8_OP_SEL_e64_]]
+ ; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GFX12-NEXT: [[V_CVT_F32_BF8_OP_SEL_e64_dpp:%[0-9]+]]:vgpr_32 = V_CVT_F32_BF8_OP_SEL_e64_dpp [[DEF]], [[COPY]], 2, 228, 15, 15, 1, implicit $mode, implicit $exec
+ ; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_BF8_OP_SEL_e64_dpp]]
; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0
%0:vgpr_32 = COPY $vgpr0
%1:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec
- %2:vgpr_32 = V_CVT_F32_BF8_OP_SEL_e64 8, killed %1, 0, implicit $mode, implicit $exec
+ %2:vgpr_32 = V_CVT_F32_BF8_OP_SEL_e64 killed %1, 2, implicit $mode, implicit $exec
$vgpr0 = COPY %2
SI_RETURN_TO_EPILOG $vgpr0
@@ -56,13 +56,13 @@ body: |
; GFX12: liveins: $vgpr0
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY]], [[COPY]], 228, 15, 15, -1, implicit $exec
- ; GFX12-NEXT: [[V_CVT_F32_FP8_OP_SEL_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_FP8_OP_SEL_e64 12, killed [[V_MOV_B32_dpp]], 0, implicit $mode, implicit $exec
- ; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_FP8_OP_SEL_e64_]]
+ ; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GFX12-NEXT: [[V_CVT_F32_FP8_OP_SEL_e64_dpp:%[0-9]+]]:vgpr_32 = V_CVT_F32_FP8_OP_SEL_e64_dpp [[DEF]], [[COPY]], 3, 228, 15, 15, 1, implicit $mode, implicit $exec
+ ; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_FP8_OP_SEL_e64_dpp]]
; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0
%0:vgpr_32 = COPY $vgpr0
%1:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec
- %2:vgpr_32 = V_CVT_F32_FP8_OP_SEL_e64 12, killed %1, 0, implicit $mode, implicit $exec
+ %2:vgpr_32 = V_CVT_F32_FP8_OP_SEL_e64 killed %1, 3, implicit $mode, implicit $exec
$vgpr0 = COPY %2
SI_RETURN_TO_EPILOG $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
index 7662a3b78dea23..d3fc96d7ff8012 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
@@ -47,7 +47,7 @@ define float @test_cvt_f32_bf8_byte1(i32 %a) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[0,1]
+; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 byte_sel:1
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 1)
ret float %ret
@@ -67,7 +67,7 @@ define float @test_cvt_f32_bf8_byte2(i32 %a) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[1,0]
+; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 byte_sel:2
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 2)
ret float %ret
@@ -87,7 +87,7 @@ define float @test_cvt_f32_bf8_byte3(i32 %a) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[1,1]
+; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 byte_sel:3
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 3)
ret float %ret
@@ -127,7 +127,7 @@ define float @test_cvt_f32_fp8_byte1(i32 %a) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 op_sel:[0,1]
+; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 byte_sel:1
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 1)
ret float %ret
@@ -147,7 +147,7 @@ define float @test_cvt_f32_fp8_byte2(i32 %a) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 op_sel:[1,0]
+; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 byte_sel:2
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 2)
ret float %ret
@@ -167,7 +167,7 @@ define float @test_cvt_f32_fp8_byte3(i32 %a) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 op_sel:[1,1]
+; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 byte_sel:3
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 3)
ret float %ret
@@ -552,7 +552,7 @@ define float @test_sext_cvt_f32_fp8(i16 %a) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 op_sel:[0,1]
+; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 byte_sel:1
; GFX12-NEXT: s_setpc_b64 s[30:31]
%a.sext = sext i16 %a to i32
%ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a.sext, i32 1)
@@ -576,7 +576,7 @@ define float @test_sext_cvt_f32_bf8(i16 %a) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[0,1]
+; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 byte_sel:1
; GFX12-NEXT: s_setpc_b64 s[30:31]
%a.sext = sext i16 %a to i32
%ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a.sext, i32 1)
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s
index 662905809ad9fe..103fa67064ef1b 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s
@@ -399,73 +399,73 @@ v_ctz_i32_b32_e64 v255, 0xaf123456
v_cvt_f32_bf8_e64 v1, s3
// GFX12: encoding: [0x01,0x00,0xed,0xd5,0x03,0x00,0x00,0x00]
-v_cvt_f32_bf8_e64 v1, s3 op_sel:[0,1]
+v_cvt_f32_bf8_e64 v1, s3 byte_sel:1
// GFX12: encoding: [0x01,0x10,0xed,0xd5,0x03,0x00,0x00,0x00]
-v_cvt_f32_bf8_e64 v1, s3 op_sel:[1,0]
+v_cvt_f32_bf8_e64 v1, s3 byte_sel:2
// GFX12: encoding: [0x01,0x08,0xed,0xd5,0x03,0x00,0x00,0x00]
-v_cvt_f32_bf8_e64 v1, s3 op_sel:[1,1]
+v_cvt_f32_bf8_e64 v1, s3 byte_sel:3
// GFX12: encoding: [0x01,0x18,0xed,0xd5,0x03,0x00,0x00,0x00]
v_cvt_f32_bf8_e64 v1, 3
// GFX12: encoding: [0x01,0x00,0xed,0xd5,0x83,0x00,0x00,0x00]
-v_cvt_f32_bf8_e64 v1, 3 op_sel:[0,1]
+v_cvt_f32_bf8_e64 v1, 3 byte_sel:1
// GFX12: encoding: [0x01,0x10,0xed,0xd5,0x83,0x00,0x00,0x00]
-v_cvt_f32_bf8_e64 v1, 3 op_sel:[1,0]
+v_cvt_f32_bf8_e64 v1, 3 byte_sel:2
// GFX12: encoding: [0x01,0x08,0xed,0xd5,0x83,0x00,0x00,0x00]
-v_cvt_f32_bf8_e64 v1, 3 op_sel:[1,1]
+v_cvt_f32_bf8_e64 v1, 3 byte_sel:3
// GFX12: encoding: [0x01,0x18,0xed,0xd5,0x83,0x00,0x00,0x00]
v_cvt_f32_bf8_e64 v1, v3
// GFX12: encoding: [0x01,0x00,0xed,0xd5,0x03,0x01,0x00,0x00]
-v_cvt_f32_bf8_e64 v1, v3 op_sel:[0,1]
+v_cvt_f32_bf8_e64 v1, v3 byte_sel:1
// GFX12: encoding: [0x01,0x10,0xed,0xd5,0x03,0x01,0x00,0x00]
-v_cvt_f32_bf8_e64 v1, v3 op_sel:[1,0]
+v_cvt_f32_bf8_e64 v1, v3 byte_sel:2
// GFX12: encoding: [0x01,0x08,0xed,0xd5,0x03,0x01,0x00,0x00]
-v_cvt_f32_bf8_e64 v1, v3 op_sel:[1,1]
+v_cvt_f32_bf8_e64 v1, v3 byte_sel:3
// GFX12: encoding: [0x01,0x18,0xed,0xd5,0x03,0x01,0x00,0x00]
v_cvt_f32_fp8_e64 v1, s3
// GFX12: encoding: [0x01,0x00,0xec,0xd5,0x03,0x00,0x00,0x00]
-v_cvt_f32_fp8_e64 v1, s3 op_sel:[0,1]
+v_cvt_f32_fp8_e64 v1, s3 byte_sel:1
// GFX12: encoding: [0x01,0x10,0xec,0xd5,0x03,0x00,0x00,0x00]
-v_cvt_f32_fp8_e64 v1, s3 op_sel:[1,0]
+v_cvt_f32_fp8_e64 v1, s3 byte_sel:2
// GFX12: encoding: [0x01,0x08,0xec,0xd5,0x03,0x00,0x00,0x00]
-v_cvt_f32_fp8_e64 v1, s3 op_sel:[1,1]
+v_cvt_f32_fp8_e64 v1, s3 byte_sel:3
// GFX12: encoding: [0x01,0x18,0xec,0xd5,0x03,0x00,0x00,0x00]
v_cvt_f32_fp8_e64 v1, 3
// GFX12: encoding: [0x01,0x00,0xec,0xd5,0x83,0x00,0x00,0x00]
-v_cvt_f32_fp8_e64 v1, 3 op_sel:[0,1]
+v_cvt_f32_fp8_e64 v1, 3 byte_sel:1
// GFX12: encoding: [0x01,0x10,0xec,0xd5,0x83,0x00,0x00,0x00]
-v_cvt_f32_fp8_e64 v1, 3 op_sel:[1,0]
+v_cvt_f32_fp8_e64 v1, 3 byte_sel:2
// GFX12: encoding: [0x01,0x08,0xec,0xd5,0x83,0x00,0x00,0x00]
-v_cvt_f32_fp8_e64 v1, 3 op_sel:[1,1]
+v_cvt_f32_fp8_e64 v1, 3 byte_sel:3
// GFX12: encoding: [0x01,0x18,0xec,0xd5,0x83,0x00,0x00,0x00]
v_cvt_f32_fp8_e64 v1, v3
// GFX12: encoding: [0x01,0x00,0xec,0xd5,0x03,0x01,0x00,0x00]
-v_cvt_f32_fp8_e64 v1, v3 op_sel:[0,1]
+v_cvt_f32_fp8_e64 v1, v3 byte_sel:1
// GFX12: encoding: [0x01,0x10,0xec,0xd5,0x03,0x01,0x00,0x00]
-v_cvt_f32_fp8_e64 v1, v3 op_sel:[1,0]
+v_cvt_f32_fp8_e64 v1, v3 byte_sel:2
// GFX12: encoding: [0x01,0x08,0xec,0xd5,0x03,0x01,0x00,0x00]
-v_cvt_f32_fp8_e64 v1, v3 op_sel:[1,1]
+v_cvt_f32_fp8_e64 v1, v3 byte_sel:3
// GFX12: encoding: [0x01,0x18,0xec,0xd5,0x03,0x01,0x00,0x00]
v_cvt_pk_f32_bf8_e64 v[2:3], s3
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s
index 706e9e3717dcc7..ae1381a5a72932 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s
@@ -516,6 +516,36 @@ v_cvt_f32_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_
v_cvt_f32_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
// GFX12: [0xff,0x81,0x8b,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+v_cvt_f32_fp8 v1, v2 quad_perm:[0,1,2,3]
+// GFX12: v_cvt_f32_fp8_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd8,0x02,0x7e,0x02,0xe4,0x00,0xff]
+
+v_cvt_f32_fp8 v1, v2 byte_sel:0 quad_perm:[0,1,2,3]
+// GFX12: v_cvt_f32_fp8_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x00,0xec,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff]
+
+v_cvt_f32_fp8 v1, v2 byte_sel:1 quad_perm:[0,1,2,3]
+// GFX12: v_cvt_f32_fp8_e64_dpp v1, v2 byte_sel:1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x10,0xec,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff]
+
+v_cvt_f32_fp8 v1, v2 byte_sel:2 quad_perm:[0,1,2,3]
+// GFX12: v_cvt_f32_fp8_e64_dpp v1, v2 byte_sel:2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x08,0xec,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff]
+
+v_cvt_f32_fp8 v1, v2 byte_sel:3 quad_perm:[0,1,2,3]
+// GFX12: v_cvt_f32_fp8_e64_dpp v1, v2 byte_sel:3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x18,0xec,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff]
+
+v_cvt_f32_bf8 v1, v2 quad_perm:[0,1,2,3]
+// GFX12: v_cvt_f32_bf8_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xda,0x02,0x7e,0x02,0xe4,0x00,0xff]
+
+v_cvt_f32_bf8 v1, v2 byte_sel:0 quad_perm:[0,1,2,3]
+// GFX12: v_cvt_f32_bf8_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff]
+
+v_cvt_f32_bf8 v1, v2 byte_sel:1 quad_perm:[0,1,2,3]
+// GFX12: v_cvt_f32_bf8_e64_dpp v1, v2 byte_sel:1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x10,0xed,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff]
+
+v_cvt_f32_bf8 v1, v2 byte_sel:2 quad_perm:[0,1,2,3]
+// GFX12: v_cvt_f32_bf8_e64_dpp v1, v2 byte_sel:2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x08,0xed,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff]
+
+v_cvt_f32_bf8 v1, v2 byte_sel:3 quad_perm:[0,1,2,3]
+// GFX12: v_cvt_f32_bf8_e64_dpp v1, v2 byte_sel:3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x18,0xed,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff]
+
v_cvt_f32_i32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x85,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s
index 002cd778a99769..d88922c111f604 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s
@@ -144,6 +144,36 @@ v_cvt_f32_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
v_cvt_f32_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0xff,0x81,0x8b,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+v_cvt_f32_fp8 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_f32_fp8_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd8,0x02,0x7e,0x02,0x77,0x39,0x05]
+
+v_cvt_f32_fp8 v1, v2 byte_sel:0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_f32_fp8_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x00,0xec,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+
+v_cvt_f32_fp8 v1, v2 byte_sel:1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_f32_fp8_e64_dpp v1, v2 byte_sel:1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x10,0xec,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+
+v_cvt_f32_fp8 v1, v2 byte_sel:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_f32_fp8_e64_dpp v1, v2 byte_sel:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x08,0xec,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+
+v_cvt_f32_fp8 v1, v2 byte_sel:3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_f32_fp8_e64_dpp v1, v2 byte_sel:3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x18,0xec,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+
+v_cvt_f32_bf8 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_f32_bf8_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xda,0x02,0x7e,0x02,0x77,0x39,0x05]
+
+v_cvt_f32_bf8 v1, v2 byte_sel:0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_f32_bf8_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+
+v_cvt_f32_bf8 v1, v2 byte_sel:1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_f32_bf8_e64_dpp v1, v2 byte_sel:1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x10,0xed,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+
+v_cvt_f32_bf8 v1, v2 byte_sel:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_f32_bf8_e64_dpp v1, v2 byte_sel:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x08,0xed,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+
+v_cvt_f32_bf8 v1, v2 byte_sel:3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_f32_bf8_e64_dpp v1, v2 byte_sel:3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x18,0xed,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+
v_cvt_f32_i32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x85,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt
index ac745c5433244a..f9c768e3e02665 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt
@@ -342,12 +342,42 @@
# GFX12: v_cvt_f32_fp8_e64_dpp v1, v3 quad_perm:[0,2,1,1] row_mask:0x5 bank_mask:0xe ; encoding: [0x01,0x00,0xec,0xd5,0xfa,0x00,0x00,0x00,0x03,0x58,0x00,0x5e]
0x01,0x00,0xec,0xd5,0xfa,0x00,0x00,0x00,0x03,0x58,0x00,0x5e
+# GFX12: v_cvt_f32_fp8_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd8,0x02,0x7e,0x02,0xe4,0x00,0xff]
+0xfa,0xd8,0x02,0x7e,0x02,0xe4,0x00,0xff
+
+# GFX12: v_cvt_f32_fp8_e64_dpp v1, v2 byte_sel:1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x10,0xec,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff]
+0x01,0x10,0xec,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff
+
+# GFX12: v_cvt_f32_fp8_e64_dpp v1, v2 byte_sel:2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x08,0xec,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff]
+0x01,0x08,0xec,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff
+
+# GFX12: v_cvt_f32_fp8_e64_dpp v1, v2 byte_sel:3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x18,0xec,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff]
+0x01,0x18,0xec,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff
+
+# GFX12: v_cvt_f32_fp8_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x00,0xec,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff]
+0x01,0x00,0xec,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff
+
# GFX12: v_cvt_f32_bf8_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x2 bank_mask:0xd ; encoding: [0x05,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0x2d]
0x05,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0x2d
# GFX12: v_cvt_f32_bf8_e64_dpp v1, v3 quad_perm:[0,2,1,1] row_mask:0x5 bank_mask:0xe ; encoding: [0x01,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x03,0x58,0x00,0x5e]
0x01,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x03,0x58,0x00,0x5e
+# GFX12: v_cvt_f32_bf8_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xda,0x02,0x7e,0x02,0xe4,0x00,0xff]
+0xfa,0xda,0x02,0x7e,0x02,0xe4,0x00,0xff
+
+# GFX12: v_cvt_f32_bf8_e64_dpp v1, v2 byte_sel:1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x10,0xed,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff]
+0x01,0x10,0xed,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff
+
+# GFX12: v_cvt_f32_bf8_e64_dpp v1, v2 byte_sel:2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x08,0xed,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff]
+0x01,0x08,0xed,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff
+
+# GFX12: v_cvt_f32_bf8_e64_dpp v1, v2 byte_sel:3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x18,0xed,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff]
+0x01,0x18,0xed,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff
+
+# GFX12: v_cvt_f32_bf8_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff]
+0x01,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff
+
# GFX12: v_cvt_f16_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x8a,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x8a,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt
index fdeda3bb272da6..eccd691855774e 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt
@@ -78,12 +78,42 @@
# GFX12: v_cvt_f32_fp8_e64_dpp v1, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x00,0xec,0xd5,0xe9,0x00,0x00,0x00,0x03,0x77,0x39,0x05]
0x01,0x00,0xec,0xd5,0xe9,0x00,0x00,0x00,0x03,0x77,0x39,0x05
+# GFX12: v_cvt_f32_fp8_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd8,0x02,0x7e,0x02,0x77,0x39,0x05]
+0xe9,0xd8,0x02,0x7e,0x02,0x77,0x39,0x05
+
+# GFX12: v_cvt_f32_fp8_e64_dpp v1, v2 byte_sel:1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x10,0xec,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+0x01,0x10,0xec,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05
+
+# GFX12: v_cvt_f32_fp8_e64_dpp v1, v2 byte_sel:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x08,0xec,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+0x01,0x08,0xec,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05
+
+# GFX12: v_cvt_f32_fp8_e64_dpp v1, v2 byte_sel:3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x18,0xec,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+0x01,0x18,0xec,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05
+
+# GFX12: v_cvt_f32_fp8_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x00,0xec,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+0x01,0x00,0xec,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05
+
# GFX12: v_cvt_f32_bf8_e64_dpp v5, v1 dpp8:[0,1,2,3,4,5,6,7] ; encoding: [0x05,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x01,0x88,0xc6,0xfa]
0x05,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x01,0x88,0xc6,0xfa
# GFX12: v_cvt_f32_bf8_e64_dpp v1, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x03,0x77,0x39,0x05]
0x01,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x03,0x77,0x39,0x05
+# GFX12: v_cvt_f32_bf8_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xda,0x02,0x7e,0x02,0x77,0x39,0x05]
+0xe9,0xda,0x02,0x7e,0x02,0x77,0x39,0x05
+
+# GFX12: v_cvt_f32_bf8_e64_dpp v1, v2 byte_sel:1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x10,0xed,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+0x01,0x10,0xed,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05
+
+# GFX12: v_cvt_f32_bf8_e64_dpp v1, v2 byte_sel:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x08,0xed,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+0x01,0x08,0xed,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05
+
+# GFX12: v_cvt_f32_bf8_e64_dpp v1, v2 byte_sel:3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x18,0xed,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+0x01,0x18,0xed,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05
+
+# GFX12: v_cvt_f32_bf8_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+0x01,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05
+
# GFX12: v_cvt_f16_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x8a,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x8a,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
More information about the llvm-commits
mailing list