[llvm] 6e722bb - [AMDGPU] Support byte_sel modifier on v_cvt_sr_fp8_f32 and v_cvt_sr_bf8_f32 (#90244)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Apr 26 13:03:01 PDT 2024
Author: Stanislav Mekhanoshin
Date: 2024-04-26T13:02:57-07:00
New Revision: 6e722bbe30bd7d44f8b1dbf70e9f341a7c7e65ff
URL: https://github.com/llvm/llvm-project/commit/6e722bbe30bd7d44f8b1dbf70e9f341a7c7e65ff
DIFF: https://github.com/llvm/llvm-project/commit/6e722bbe30bd7d44f8b1dbf70e9f341a7c7e65ff.diff
LOG: [AMDGPU] Support byte_sel modifier on v_cvt_sr_fp8_f32 and v_cvt_sr_bf8_f32 (#90244)
Added:
Modified:
llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
llvm/lib/Target/AMDGPU/SIInstrInfo.td
llvm/lib/Target/AMDGPU/VOP3Instructions.td
llvm/lib/Target/AMDGPU/VOPInstructions.td
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
llvm/test/MC/AMDGPU/gfx12_asm_vop3.s
llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s
llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s
llvm/test/MC/AMDGPU/gfx12_asm_vop3_err.s
llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt
llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt
llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 38667235211471..c4ec7a7befd47e 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -172,6 +172,7 @@ class AMDGPUOperand : public MCParsedAsmOperand {
ImmTyWaitEXP,
ImmTyWaitVAVDst,
ImmTyWaitVMVSrc,
+ ImmTyByteSel,
};
// Immediate operand kind.
@@ -410,6 +411,9 @@ class AMDGPUOperand : public MCParsedAsmOperand {
bool isOpSelHi() const { return isImmTy(ImmTyOpSelHi); }
bool isNegLo() const { return isImmTy(ImmTyNegLo); }
bool isNegHi() const { return isImmTy(ImmTyNegHi); }
+ bool isByteSel() const {
+ return isImmTy(ImmTyByteSel) && isUInt<2>(getImm());
+ }
bool isRegOrImm() const {
return isReg() || isImm();
@@ -1139,6 +1143,7 @@ class AMDGPUOperand : public MCParsedAsmOperand {
case ImmTyWaitEXP: OS << "WaitEXP"; break;
case ImmTyWaitVAVDst: OS << "WaitVAVDst"; break;
case ImmTyWaitVMVSrc: OS << "WaitVMVSrc"; break;
+ case ImmTyByteSel: OS << "ByteSel" ; break;
}
// clang-format on
}
@@ -8644,6 +8649,13 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands,
}
}
+ if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::byte_sel)) {
+ assert(AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in));
+ Inst.addOperand(Inst.getOperand(0));
+ addOptionalImmOperand(Inst, Operands, OptionalIdx,
+ AMDGPUOperand::ImmTyByteSel);
+ }
+
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::clamp))
addOptionalImmOperand(Inst, Operands, OptionalIdx,
AMDGPUOperand::ImmTyClampSI);
@@ -8680,8 +8692,8 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
if (Opc == AMDGPU::V_CVT_SR_BF8_F32_vi ||
Opc == AMDGPU::V_CVT_SR_FP8_F32_vi ||
- Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_gfx12 ||
- Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_gfx12) {
+ Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_gfx12 ||
+ Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_gfx12) {
Inst.addOperand(MCOperand::createImm(0)); // Placeholder for src2_mods
Inst.addOperand(Inst.getOperand(0));
}
@@ -8692,7 +8704,11 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
!(Opc == AMDGPU::V_CVT_PK_BF8_F32_e64_dpp_gfx12 ||
Opc == AMDGPU::V_CVT_PK_FP8_F32_e64_dpp_gfx12 ||
Opc == AMDGPU::V_CVT_PK_BF8_F32_e64_dpp8_gfx12 ||
- Opc == AMDGPU::V_CVT_PK_FP8_F32_e64_dpp8_gfx12)) {
+ Opc == AMDGPU::V_CVT_PK_FP8_F32_e64_dpp8_gfx12 ||
+ Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp_gfx12 ||
+ Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp8_gfx12 ||
+ Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp_gfx12 ||
+ Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp8_gfx12)) {
assert(!IsPacked);
Inst.addOperand(Inst.getOperand(0));
}
@@ -9207,10 +9223,11 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands,
Inst.addOperand(Inst.getOperand(0));
}
- bool IsVOP3CvtSrDpp = Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 ||
- Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12 ||
- Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp_gfx12 ||
- Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp_gfx12;
+ bool IsVOP3CvtSrDpp =
+ Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp8_gfx12 ||
+ Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp8_gfx12 ||
+ Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp_gfx12 ||
+ Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp_gfx12;
if (IsVOP3CvtSrDpp) {
if (Src2ModIdx == static_cast<int>(Inst.getNumOperands())) {
Inst.addOperand(MCOperand::createImm(0));
@@ -9243,6 +9260,11 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands,
llvm_unreachable("unhandled operand type");
}
}
+
+ if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::byte_sel))
+ addOptionalImmOperand(Inst, Operands, OptionalIdx,
+ AMDGPUOperand::ImmTyByteSel);
+
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::clamp))
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI);
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index dc1bf92771b443..8fd36b84a00cd8 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -869,10 +869,6 @@ void AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
if (VDstInIdx != -1)
insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in);
- if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 ||
- MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12)
- insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2);
-
unsigned DescNumOps = MCII->get(Opc).getNumOperands();
if (MI.getNumOperands() < DescNumOps &&
AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) {
@@ -902,10 +898,6 @@ void AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const {
if (VDstInIdx != -1)
insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in);
- if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp_gfx12 ||
- MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp_gfx12)
- insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2);
-
unsigned Opc = MI.getOpcode();
unsigned DescNumOps = MCII->get(Opc).getNumOperands();
if (MI.getNumOperands() < DescNumOps &&
diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
index 5090b0a07da4b0..91733c2933b4c9 100644
--- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -409,6 +409,11 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
if (NegHiOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::neg_hi)) {
DPPInst.addImm(NegHiOpr->getImm());
}
+ auto *ByteSelOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::byte_sel);
+ if (ByteSelOpr &&
+ AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::byte_sel)) {
+ DPPInst.addImm(ByteSelOpr->getImm());
+ }
}
DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl));
DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask));
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index b6a95906bc45c6..883b6c4407fe59 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -1806,4 +1806,14 @@ void AMDGPUInstPrinter::printEndpgm(const MCInst *MI, unsigned OpNo,
O << ' ' << formatDec(Imm);
}
+void AMDGPUInstPrinter::printByteSel(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ uint8_t Imm = MI->getOperand(OpNo).getImm();
+ if (!Imm)
+ return;
+
+ O << " byte_sel:" << formatDec(Imm);
+}
+
#include "AMDGPUGenAsmWriter.inc"
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
index c801eaf1111e2f..d6d7fd34b68cc1 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -186,6 +186,8 @@ class AMDGPUInstPrinter : public MCInstPrinter {
const MCSubtargetInfo &STI, raw_ostream &O);
void printExpTgt(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
+ void printByteSel(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
public:
static void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O,
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index f1afbcc060b261..bf6cfe90ebfbee 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1123,6 +1123,8 @@ def WaitEXP : NamedIntOperand<i8, "wait_exp">;
def WaitVAVDst : NamedIntOperand<i8, "wait_va_vdst">;
def WaitVMVSrc : NamedIntOperand<i8, "wait_vm_vsrc">;
+def ByteSel : NamedIntOperand<i8, "byte_sel">;
+
class KImmFPOperand<ValueType vt> : ImmOperand<vt> {
let OperandNamespace = "AMDGPU";
let OperandType = "OPERAND_KIMM"#vt.Size;
@@ -1700,9 +1702,9 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
Src1Mod:$src1_modifiers, Src1RC:$src1,
clampmod0:$clamp, omod0:$omod),
- (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
- Src1Mod:$src1_modifiers, Src1RC:$src1,
- clampmod0:$clamp))
+ !con((ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1),
+ !if(HasClamp, (ins clampmod0:$clamp), (ins))))
/* else */,
// VOP2 without modifiers
!if (HasClamp,
@@ -2036,7 +2038,8 @@ class getAsmDPP8 <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT
class getAsmVOP3Base <int NumSrcArgs, bit HasDst, bit HasClamp,
bit HasOpSel, bit HasOMod, bit IsVOP3P,
bit HasModifiers, bit Src0HasMods,
- bit Src1HasMods, bit Src2HasMods, ValueType DstVT = i32> {
+ bit Src1HasMods, bit Src2HasMods, ValueType DstVT = i32,
+ bit HasByteSel = 0> {
string dst = !if(HasDst,
!if(!eq(DstVT.Size, 1),
"$sdst",
@@ -2058,6 +2061,7 @@ class getAsmVOP3Base <int NumSrcArgs, bit HasDst, bit HasClamp,
string src1 = !if(Src1HasMods, src1mods, src1nomods);
string src2 = !if(Src2HasMods, src2mods, src2nomods);
string opsel = !if(HasOpSel, "$op_sel", "");
+ string bytesel = !if(HasByteSel, "$byte_sel", "");
string 3PMods = !if(IsVOP3P,
!if(HasOpSel, "$op_sel_hi", "")
#!if(HasModifiers, "$neg_lo$neg_hi", ""),
@@ -2065,7 +2069,7 @@ class getAsmVOP3Base <int NumSrcArgs, bit HasDst, bit HasClamp,
string clamp = !if(HasClamp, "$clamp", "");
string omod = !if(HasOMod, "$omod", "");
- string ret = dst#!if(!gt(NumSrcArgs,0),", "#src0#src1#src2#opsel#3PMods#clamp#omod, "");
+ string ret = dst#!if(!gt(NumSrcArgs,0),", "#src0#src1#src2#opsel#bytesel#3PMods#clamp#omod, "");
}
@@ -2282,6 +2286,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
field bit IsSWMMAC = 0;
field bit IsFP8 = 0;
+ field bit IsFP8DstByteSel = 0;
field bit HasDst = !ne(DstVT.Value, untyped.Value);
field bit HasDst32 = HasDst;
@@ -2401,7 +2406,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
field string AsmDPP8 = getAsmDPP8<HasDst, NumSrcArgs, 0 /*HasModifiers*/, DstVT>.ret;
field string AsmVOP3Base = getAsmVOP3Base<NumSrcArgs, HasDst, HasClamp,
HasOpSel, HasOMod, IsVOP3P, HasModifiers, HasModifiers, HasModifiers,
- HasModifiers, DstVT>.ret;
+ HasModifiers, DstVT, IsFP8DstByteSel>.ret;
field string Asm64 = AsmVOP3Base;
field string AsmVOP3P = getAsmVOP3P<NumSrcArgs, HasModifiers, HasClamp, HasOpSel>.ret;
field string AsmVOP3OpSel = getAsmVOP3OpSel<NumSrcArgs,
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 647595d9ccab30..616bc7684753e0 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -580,6 +580,22 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile<VOPProfile<[i32, f32, i32, f32]>,
HasSrc2FloatMods>.ret>.ret);
}
+class VOP3_CVT_SR_F8_ByteSel_Profile<ValueType SrcVT> :
+ VOP3_Profile<VOPProfile<[i32, SrcVT, i32, untyped]>> {
+ let IsFP8DstByteSel = 1;
+ let HasClamp = 0;
+ defvar bytesel = (ins VGPR_32:$vdst_in, ByteSel:$byte_sel);
+ let Ins64 = !con(getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
+ HasClamp, HasModifiers, HasSrc2Mods,
+ HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret,
+ bytesel);
+ let InsVOP3Base = !con(
+ getInsVOP3Base<Src0VOP3DPP, Src1VOP3DPP,
+ Src2VOP3DPP, NumSrcArgs, HasClamp, HasModifiers, HasSrc2Mods, HasOMod,
+ Src0ModVOP3DPP, Src1ModVOP3DPP, Src2ModVOP3DPP, HasOpSel>.ret,
+ bytesel);
+}
+
def IsPow2Plus1: PatLeaf<(i32 imm), [{
uint32_t V = N->getZExtValue();
return isPowerOf2_32(V - 1);
@@ -645,12 +661,17 @@ let OtherPredicates = [HasFP8ConversionInsts], mayRaiseFPException = 0,
let Constraints = "$vdst = $vdst_in", DisableEncoding = "$vdst_in" in {
defm V_CVT_PK_FP8_F32 : VOP3Inst<"v_cvt_pk_fp8_f32", VOP3_CVT_PK_F8_F32_Profile>;
defm V_CVT_PK_BF8_F32 : VOP3Inst<"v_cvt_pk_bf8_f32", VOP3_CVT_PK_F8_F32_Profile>;
+
+ let SubtargetPredicate = isGFX12Plus in {
+ defm V_CVT_SR_FP8_F32_gfx12 : VOP3Inst<"v_cvt_sr_fp8_f32_gfx12", VOP3_CVT_SR_F8_ByteSel_Profile<f32>>;
+ defm V_CVT_SR_BF8_F32_gfx12 : VOP3Inst<"v_cvt_sr_bf8_f32_gfx12", VOP3_CVT_SR_F8_ByteSel_Profile<f32>>;
+ }
}
// These instructions have non-standard use of op_sel. In particular they are
// using op_sel bits 2 and 3 while only having two sources. Therefore dummy
// src2 is used to hold the op_sel value.
- let Constraints = "$vdst = $src2", DisableEncoding = "$src2" in {
+ let Constraints = "$vdst = $src2", DisableEncoding = "$src2", SubtargetPredicate = isGFX940Plus in {
defm V_CVT_SR_FP8_F32 : VOP3Inst<"v_cvt_sr_fp8_f32", VOP3_CVT_SR_F8_F32_Profile>;
defm V_CVT_SR_BF8_F32 : VOP3Inst<"v_cvt_sr_bf8_f32", VOP3_CVT_SR_F8_F32_Profile>;
}
@@ -667,15 +688,28 @@ class Cvt_SR_F8_F32_Pat<SDPatternOperator node, bits<2> index, VOP3_Pseudo inst>
!if(index{0}, SRCMODS.OP_SEL_0, 0), $old, 0)
>;
+class Cvt_SR_F8_ByteSel_Pat<SDPatternOperator node, VOP3_Pseudo inst, ValueType SrcVT> : GCNPat<
+ (i32 (node (VOP3Mods SrcVT:$src0, i32:$src0_modifiers), (VOP3Mods i32:$src1, i32:$src1_modifiers),
+ i32:$old, timm:$byte_sel)),
+ (inst $src0_modifiers, $src0, $src1_modifiers, $src1, $old, (as_i32timm $byte_sel))
+>;
+
let OtherPredicates = [HasFP8ConversionInsts] in {
foreach Index = [0, -1] in {
def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_fp8_f32, Index, V_CVT_PK_FP8_F32_e64>;
def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_bf8_f32, Index, V_CVT_PK_BF8_F32_e64>;
}
-foreach Index = [0, 1, 2, 3] in {
- def : Cvt_SR_F8_F32_Pat<int_amdgcn_cvt_sr_fp8_f32, Index, V_CVT_SR_FP8_F32_e64>;
- def : Cvt_SR_F8_F32_Pat<int_amdgcn_cvt_sr_bf8_f32, Index, V_CVT_SR_BF8_F32_e64>;
+let SubtargetPredicate = isGFX940Plus in {
+ foreach Index = [0, 1, 2, 3] in {
+ def : Cvt_SR_F8_F32_Pat<int_amdgcn_cvt_sr_fp8_f32, Index, V_CVT_SR_FP8_F32_e64>;
+ def : Cvt_SR_F8_F32_Pat<int_amdgcn_cvt_sr_bf8_f32, Index, V_CVT_SR_BF8_F32_e64>;
+ }
+}
+
+let SubtargetPredicate = isGFX12Plus in {
+ def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_fp8_f32, V_CVT_SR_FP8_F32_gfx12_e64, f32>;
+ def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_bf8_f32, V_CVT_SR_BF8_F32_gfx12_e64, f32>;
}
}
@@ -1040,8 +1074,8 @@ defm V_PERMLANEX16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x310>;
defm V_CVT_PK_FP8_F32 : VOP3Only_Realtriple_gfx12<0x369>;
defm V_CVT_PK_BF8_F32 : VOP3Only_Realtriple_gfx12<0x36a>;
-defm V_CVT_SR_FP8_F32 : VOP3Only_Realtriple_gfx12<0x36b>;
-defm V_CVT_SR_BF8_F32 : VOP3Only_Realtriple_gfx12<0x36c>;
+defm V_CVT_SR_FP8_F32_gfx12 : VOP3_Realtriple_with_name_gfx12<0x36b, "V_CVT_SR_FP8_F32_gfx12", "v_cvt_sr_fp8_f32" >;
+defm V_CVT_SR_BF8_F32_gfx12 : VOP3_Realtriple_with_name_gfx12<0x36c, "V_CVT_SR_BF8_F32_gfx12", "v_cvt_sr_bf8_f32">;
//===----------------------------------------------------------------------===//
// GFX11, GFX12
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index da16178cb58bd8..7cdb5cbfe297d5 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -311,6 +311,14 @@ class VOP3FP8OpSel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
let Inst{12} = !if(p.HasSrc0, src0_modifiers{3}, 0);
}
+ class VOP3FP8OpSel_dst_bytesel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
+ bits<2> byte_sel;
+
+ let Inst{11} = 0; // op_sel0
+ let Inst{12} = 0; // op_sel1
+ let Inst{14-13} = byte_sel; // op_sel2/3
+ }
+
class VOP3DotOpSel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3OpSel_gfx11_gfx12<op, p>{
let Inst{11} = ?;
let Inst{12} = ?;
@@ -741,6 +749,7 @@ class VOP3_DPPe_Common_Base<bits<10> op, VOPProfile P> : Enc96 {
bits<3> src2_modifiers;
bits<1> clamp;
bits<2> omod;
+ bits<2> byte_sel;
let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0);
let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0);
@@ -748,8 +757,8 @@ class VOP3_DPPe_Common_Base<bits<10> op, VOPProfile P> : Enc96 {
// OPSEL must be set such that the low result only uses low inputs, and the high result only uses high inputs.
let Inst{11} = !if(P.HasOpSel,!if(P.HasSrc0Mods, src0_modifiers{2}, 0),?);
let Inst{12} = !if(P.HasOpSel,!if(P.HasSrc1Mods, src1_modifiers{2}, !if((P.IsFP8), src0_modifiers{3}, 0)), ?);
- let Inst{13} = !if(P.HasOpSel,!if(P.HasSrc2Mods, src2_modifiers{2}, 0),?);
- let Inst{14} = !if(P.HasOpSel,!if(P.HasSrc0Mods, src0_modifiers{3}, 0),?);
+ let Inst{13} = !if(P.HasOpSel,!if(P.HasSrc2Mods, src2_modifiers{2}, 0),!if(P.IsFP8DstByteSel, byte_sel{0}, ?));
+ let Inst{14} = !if(P.HasOpSel,!if(P.HasSrc0Mods, src0_modifiers{3}, 0),!if(P.IsFP8DstByteSel, byte_sel{1}, ?));
let Inst{15} = !if(P.HasClamp, clamp, 0);
let Inst{25-16} = op;
let Inst{31-26} = 0x35;
@@ -1388,7 +1397,11 @@ multiclass VOP3_Real_Base<GFXGen Gen, bits<10> op, string opName = NAME,
bit isSingle = 0> {
defvar ps = !cast<VOP_Pseudo>(opName#"_e64");
let IsSingle = !or(isSingle, ps.Pfl.IsSingle) in {
- if ps.Pfl.HasOpSel then {
+ if ps.Pfl.IsFP8DstByteSel then {
+ def _e64#Gen.Suffix :
+ VOP3_Real_Gen<ps, Gen>,
+ VOP3FP8OpSel_dst_bytesel_gfx11_gfx12<op, ps.Pfl>;
+ } if ps.Pfl.HasOpSel then {
def _e64#Gen.Suffix :
VOP3_Real_Gen<ps, Gen>,
VOP3OpSel_gfx11_gfx12<op, ps.Pfl>;
@@ -1419,6 +1432,10 @@ multiclass VOP3_Real_with_name<GFXGen Gen, bits<10> op, string opName,
def _e64#Gen.Suffix :
VOP3_Real_Gen<ps, Gen>,
VOP3FP8OpSel_gfx11_gfx12<op, ps.Pfl>;
+ } else if ps.Pfl.IsFP8DstByteSel then {
+ def _e64#Gen.Suffix :
+ VOP3_Real_Gen<ps, Gen>,
+ VOP3FP8OpSel_dst_bytesel_gfx11_gfx12<op, ps.Pfl>;
} else if ps.Pfl.HasOpSel then {
def _e64#Gen.Suffix :
VOP3_Real_Gen<ps, Gen>,
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll
index e21d61036375a1..ffedde9416bb26 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll
@@ -97,9 +97,7 @@ define amdgpu_cs void @test_cvt_sr_bf8_f32_byte0(i32 %a, i32 %r, i32 %old, ptr a
define amdgpu_cs void @test_cvt_sr_fp8_f32_byte1(i32 %a, i32 %r, i32 %old, ptr addrspace(1) %out) {
; GFX12-LABEL: test_cvt_sr_fp8_f32_byte1:
; GFX12: ; %bb.0:
-; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,0]
+; GFX12-NEXT: v_cvt_sr_fp8_f32_e64_dpp v2, v0, v1 byte_sel:1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX12-NEXT: global_store_b32 v[3:4], v2, off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -114,9 +112,7 @@ define amdgpu_cs void @test_cvt_sr_fp8_f32_byte1(i32 %a, i32 %r, i32 %old, ptr a
define amdgpu_cs void @test_cvt_sr_fp8_f32_byte2(i32 %a, i32 %r, i32 %old, ptr addrspace(1) %out) {
; GFX12-LABEL: test_cvt_sr_fp8_f32_byte2:
; GFX12: ; %bb.0:
-; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,0,1]
+; GFX12-NEXT: v_cvt_sr_fp8_f32_e64_dpp v2, v0, v1 byte_sel:2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX12-NEXT: global_store_b32 v[3:4], v2, off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
index 9b8fdf90170458..7662a3b78dea23 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
@@ -385,7 +385,7 @@ define i32 @test_cvt_sr_bf8_f32_byte1(float %x, i32 %r, i32 %old) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,0]
+; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 byte_sel:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v0, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
@@ -409,7 +409,7 @@ define i32 @test_cvt_sr_bf8_f32_byte2(float %x, i32 %r, i32 %old) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,0,1]
+; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 byte_sel:2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v0, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
@@ -433,7 +433,7 @@ define i32 @test_cvt_sr_bf8_f32_byte3(float %x, i32 %r, i32 %old) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,1]
+; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 byte_sel:3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v0, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
@@ -479,7 +479,7 @@ define i32 @test_cvt_sr_fp8_f32_byte1(float %x, i32 %r, i32 %old) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,0]
+; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 byte_sel:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v0, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
@@ -503,7 +503,7 @@ define i32 @test_cvt_sr_fp8_f32_byte2(float %x, i32 %r, i32 %old) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,0,1]
+; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 byte_sel:2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v0, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
@@ -527,7 +527,7 @@ define i32 @test_cvt_sr_fp8_f32_byte3(float %x, i32 %r, i32 %old) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,1]
+; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 byte_sel:3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v0, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s
index be9edc3e019e34..b0854881d42875 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s
@@ -1126,6 +1126,18 @@ v_cvt_sr_fp8_f32 v10, s2, v5
v_cvt_sr_fp8_f32 v5, -|v255|, v4
// GFX12: encoding: [0x05,0x01,0x6b,0xd7,0xff,0x09,0x02,0x20]
+v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:0
+// GFX12: v_cvt_sr_fp8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6b,0xd7,0x02,0x07,0x02,0x00]
+
+v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:1
+// GFX12: v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:1 ; encoding: [0x01,0x20,0x6b,0xd7,0x02,0x07,0x02,0x00]
+
+v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:2
+// GFX12: v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:2 ; encoding: [0x01,0x40,0x6b,0xd7,0x02,0x07,0x02,0x00]
+
+v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:3
+// GFX12: v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:3 ; encoding: [0x01,0x60,0x6b,0xd7,0x02,0x07,0x02,0x00]
+
v_cvt_sr_bf8_f32 v1, v2, v3
// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0x02,0x07,0x02,0x00]
@@ -1135,6 +1147,18 @@ v_cvt_sr_bf8_f32 v10, s2, v5
v_cvt_sr_bf8_f32 v5, -|v255|, v4
// GFX12: encoding: [0x05,0x01,0x6c,0xd7,0xff,0x09,0x02,0x20]
+v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:0
+// GFX12: v_cvt_sr_bf8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6c,0xd7,0x02,0x07,0x02,0x00]
+
+v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:1
+// GFX12: v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:1 ; encoding: [0x01,0x20,0x6c,0xd7,0x02,0x07,0x02,0x00]
+
+v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:2
+// GFX12: v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:2 ; encoding: [0x01,0x40,0x6c,0xd7,0x02,0x07,0x02,0x00]
+
+v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:3
+// GFX12: v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:3 ; encoding: [0x01,0x60,0x6c,0xd7,0x02,0x07,0x02,0x00]
+
v_cvt_pk_i16_f32 v5, v1, v2
// GFX12: encoding: [0x05,0x00,0x06,0xd7,0x01,0x05,0x02,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s
index d0e309adce4101..16cd8d5aa5e9d5 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s
@@ -1192,6 +1192,18 @@ v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:
v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1
// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed]
+v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:0 quad_perm:[3,2,1,0]
+// GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff]
+
+v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:1 quad_perm:[3,2,1,0]
+// GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, v2, v3 byte_sel:1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x20,0x6c,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff]
+
+v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:2 quad_perm:[3,2,1,0]
+// GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, v2, v3 byte_sel:2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x40,0x6c,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff]
+
+v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:3 quad_perm:[3,2,1,0]
+// GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, v2, v3 byte_sel:3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x60,0x6c,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff]
+
v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd
// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed]
@@ -1219,6 +1231,18 @@ v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:
v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1
// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed]
+v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:0 quad_perm:[3,2,1,0]
+// GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff]
+
+v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:1 quad_perm:[3,2,1,0]
+// GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, v2, v3 byte_sel:1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x20,0x6b,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff]
+
+v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:2 quad_perm:[3,2,1,0]
+// GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, v2, v3 byte_sel:2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x40,0x6b,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff]
+
+v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:3 quad_perm:[3,2,1,0]
+// GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, v2, v3 byte_sel:3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x60,0x6b,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff]
+
v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x06,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s
index 25b13ac62e4a46..d6ef14cff5fa83 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s
@@ -698,6 +698,18 @@ v_cvt_sr_fp8_f32_e64_dpp v5, -v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cvt_sr_fp8_f32_e64_dpp v255, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0]
// GFX12: encoding: [0xff,0x01,0x6b,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00]
+v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x00,0x6b,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05]
+
+v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, v2, v3 byte_sel:1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x20,0x6b,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05]
+
+v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, v2, v3 byte_sel:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x40,0x6b,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05]
+
+v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, v2, v3 byte_sel:3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x60,0x6b,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05]
+
v_cvt_sr_bf8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: encoding: [0x05,0x00,0x6c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -710,6 +722,18 @@ v_cvt_sr_bf8_f32_e64_dpp v5, -v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cvt_sr_bf8_f32_e64_dpp v255, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0]
// GFX12: encoding: [0xff,0x01,0x6c,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00]
+v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x00,0x6c,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05]
+
+v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, v2, v3 byte_sel:1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x20,0x6c,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05]
+
+v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, v2, v3 byte_sel:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x40,0x6c,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05]
+
+v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, v2, v3 byte_sel:3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x60,0x6c,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05]
+
v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x06,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_err.s
index 55c5fcabea7327..31ed577ac0a233 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_err.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_err.s
@@ -101,3 +101,8 @@ v_permlane16_var_b32 v5, v1, v2 op_sel:[0, 0, 1]
// GFX12: error: invalid op_sel operand
// GFX12-NEXT:{{^}}v_permlane16_var_b32 v5, v1, v2 op_sel:[0, 0, 1]
// GFX12-NEXT:{{^}} ^
+
+v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:4
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX12-NEXT:{{^}}v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:4
+// GFX12-NEXT:{{^}} ^
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt
index 6acaa81527207a..2c911777ef97f4 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt
@@ -1020,6 +1020,15 @@
# GFX12: v_cvt_sr_fp8_f32 v5, -|v255|, v4 ; encoding: [0x05,0x01,0x6b,0xd7,0xff,0x09,0x02,0x20]
0x05,0x01,0x6b,0xd7,0xff,0x09,0x02,0x20
+# GFX12: v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:1 ; encoding: [0x01,0x20,0x6b,0xd7,0x02,0x07,0x02,0x00]
+0x01,0x20,0x6b,0xd7,0x02,0x07,0x02,0x00
+
+# GFX12: v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:2 ; encoding: [0x01,0x40,0x6b,0xd7,0x02,0x07,0x02,0x00]
+0x01,0x40,0x6b,0xd7,0x02,0x07,0x02,0x00
+
+# GFX12: v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:3 ; encoding: [0x01,0x60,0x6b,0xd7,0x02,0x07,0x02,0x00]
+0x01,0x60,0x6b,0xd7,0x02,0x07,0x02,0x00
+
# GFX12: v_cvt_sr_bf8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6c,0xd7,0x02,0x07,0x02,0x00]
0x01,0x00,0x6c,0xd7,0x02,0x07,0x02,0x00
@@ -1029,6 +1038,15 @@
# GFX12: v_cvt_sr_bf8_f32 v5, -|v255|, v4 ; encoding: [0x05,0x01,0x6c,0xd7,0xff,0x09,0x02,0x20]
0x05,0x01,0x6c,0xd7,0xff,0x09,0x02,0x20
+# GFX12: v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:1 ; encoding: [0x01,0x20,0x6c,0xd7,0x02,0x07,0x02,0x00]
+0x01,0x20,0x6c,0xd7,0x02,0x07,0x02,0x00
+
+# GFX12: v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:2 ; encoding: [0x01,0x40,0x6c,0xd7,0x02,0x07,0x02,0x00]
+0x01,0x40,0x6c,0xd7,0x02,0x07,0x02,0x00
+
+# GFX12: v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:3 ; encoding: [0x01,0x60,0x6c,0xd7,0x02,0x07,0x02,0x00]
+0x01,0x60,0x6c,0xd7,0x02,0x07,0x02,0x00
+
# GFX12: v_cvt_pk_i16_f32 v5, v1, v2 ; encoding: [0x05,0x00,0x06,0xd7,0x01,0x05,0x02,0x00]
0x05,0x00,0x06,0xd7,0x01,0x05,0x02,0x00
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt
index 0771e6449b62b3..f9b6c1b73ddc4f 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt
@@ -945,6 +945,15 @@
# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed]
0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed
+# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, v2, v3 byte_sel:1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x20,0x6c,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff]
+0x01,0x20,0x6c,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff
+
+# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, v2, v3 byte_sel:2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x40,0x6c,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff]
+0x01,0x40,0x6c,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff
+
+# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, v2, v3 byte_sel:3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x60,0x6c,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff]
+0x01,0x60,0x6c,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff
+
# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed]
0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed
@@ -972,6 +981,15 @@
# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed]
0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed
+# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, v2, v3 byte_sel:1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x20,0x6b,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff]
+0x01,0x20,0x6b,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff
+
+# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, v2, v3 byte_sel:2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x40,0x6b,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff]
+0x01,0x40,0x6b,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff
+
+# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, v2, v3 byte_sel:3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x60,0x6b,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff]
+0x01,0x60,0x6b,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff
+
# GFX12: v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x06,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x06,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt
index a836adafb31e4d..eedc6d49108782 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt
@@ -570,6 +570,15 @@
# GFX12: v_cvt_sr_fp8_f32_e64_dpp v255, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x01,0x6b,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00]
0xff,0x01,0x6b,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00
+# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, v2, v3 byte_sel:1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x20,0x6b,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05]
+0x01,0x20,0x6b,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05
+
+# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, v2, v3 byte_sel:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x40,0x6b,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05]
+0x01,0x40,0x6b,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05
+
+# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, v2, v3 byte_sel:3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x60,0x6b,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05]
+0x01,0x60,0x6b,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05
+
# GFX12: v_cvt_sr_bf8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x6c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -582,6 +591,15 @@
# GFX12: v_cvt_sr_bf8_f32_e64_dpp v255, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x01,0x6c,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00]
0xff,0x01,0x6c,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00
+# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, v2, v3 byte_sel:1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x20,0x6c,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05]
+0x01,0x20,0x6c,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05
+
+# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, v2, v3 byte_sel:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x40,0x6c,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05]
+0x01,0x40,0x6c,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05
+
+# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, v2, v3 byte_sel:3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x60,0x6c,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05]
+0x01,0x60,0x6c,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05
+
# GFX12: v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x06,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x06,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
More information about the llvm-commits
mailing list