[llvm] [AMDGPU] Encode unused VALU src0/1/2 fields as inline 0 on GFX10+ (PR #175753)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 14 08:52:24 PST 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Jay Foad (jayfoad)
<details>
<summary>Changes</summary>
This has no functional effect since these source fields are unused, but
it can avoid some conservative stalls caused by these instructions
looking like they read from an SGPR: an unused field was previously
encoded as 0, which is the encoding for s0. Using 0x80 is more benign
since it is the encoding for inline immediate 0.
Fixes: SWDEV-574953
---
Patch is 24.07 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/175753.diff
128 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp (+22-1)
- (modified) llvm/lib/Target/AMDGPU/VOP3PInstructions.td (+1)
- (modified) llvm/lib/Target/AMDGPU/VOPCInstructions.td (+8-4)
- (modified) llvm/lib/Target/AMDGPU/VOPInstructions.td (+2-1)
- (modified) llvm/test/CodeGen/AMDGPU/code-size-estimate-gfx1250.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/code-size-estimate.ll (+23-23)
- (modified) llvm/test/CodeGen/AMDGPU/emit-high-vgprs.mir (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/imm16.ll (+69-69)
- (modified) llvm/test/CodeGen/AMDGPU/immv216.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/permlane16_var-op-sel.ll (+1-1)
- (modified) llvm/test/MC/AMDGPU/expressions-gfx10.s (+5-5)
- (modified) llvm/test/MC/AMDGPU/gfx10-constant-bus.s (+1-1)
- (modified) llvm/test/MC/AMDGPU/gfx1030_new.s (+2-2)
- (modified) llvm/test/MC/AMDGPU/gfx10_asm_vop1.s (+1258-1258)
- (modified) llvm/test/MC/AMDGPU/gfx10_asm_vop2.s (+1129-1129)
- (modified) llvm/test/MC/AMDGPU/gfx10_asm_vop3.s (+1374-1374)
- (modified) llvm/test/MC/AMDGPU/gfx10_asm_vopc_e64.s (+5421-5421)
- (modified) llvm/test/MC/AMDGPU/gfx10_asm_vopcx.s (+2365-2365)
- (modified) llvm/test/MC/AMDGPU/gfx11-promotions-fake16.s (+16-16)
- (modified) llvm/test/MC/AMDGPU/gfx11-promotions.s (+16-16)
- (modified) llvm/test/MC/AMDGPU/gfx1150_asm_features-fake16.s (+2-2)
- (modified) llvm/test/MC/AMDGPU/gfx1150_asm_features.s (+2-2)
- (modified) llvm/test/MC/AMDGPU/gfx11_asm_operands.s (+12-12)
- (modified) llvm/test/MC/AMDGPU/gfx11_asm_vop1.s (+19-19)
- (modified) llvm/test/MC/AMDGPU/gfx11_asm_vop1_fake16_promote.s (+355-355)
- (modified) llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s (+710-710)
- (modified) llvm/test/MC/AMDGPU/gfx11_asm_vop2_fake16_promote.s (+23-23)
- (modified) llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_promote.s (+48-48)
- (modified) llvm/test/MC/AMDGPU/gfx11_asm_vop3-fake16.s (+752-752)
- (modified) llvm/test/MC/AMDGPU/gfx11_asm_vop3.s (+783-783)
- (modified) llvm/test/MC/AMDGPU/gfx11_asm_vop3_alias-fake16.s (+2-2)
- (modified) llvm/test/MC/AMDGPU/gfx11_asm_vop3_alias.s (+2-2)
- (modified) llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vopcx-fake16.s (+896-896)
- (modified) llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vopcx.s (+983-983)
- (modified) llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vopcx-fake16.s (+226-226)
- (modified) llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vopcx.s (+313-313)
- (modified) llvm/test/MC/AMDGPU/gfx11_asm_vop3_features.s (+4-4)
- (modified) llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1-fake16.s (+1169-1169)
- (modified) llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s (+1237-1237)
- (modified) llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop2-fake16.s (+585-585)
- (modified) llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop2.s (+609-609)
- (modified) llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopc-fake16.s (+2645-2645)
- (modified) llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopc.s (+2802-2802)
- (modified) llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopcx-fake16.s (+1372-1372)
- (modified) llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopcx.s (+1431-1431)
- (modified) llvm/test/MC/AMDGPU/gfx11_asm_vop3p.s (+241-241)
- (modified) llvm/test/MC/AMDGPU/gfx11_asm_vop3p_features.s (+16-16)
- (modified) llvm/test/MC/AMDGPU/gfx11_asm_vopc_fake16_promote.s (+269-269)
- (modified) llvm/test/MC/AMDGPU/gfx11_asm_vopc_t16_promote.s (+598-598)
- (modified) llvm/test/MC/AMDGPU/gfx11_asm_vopcx_fake16_promote.s (+162-162)
- (modified) llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_promote.s (+360-360)
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop2.s (+39-39)
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s (+194-194)
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s (+194-194)
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s (+1422-1422)
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s (+1490-1490)
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3cx.s (+1137-1137)
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3p.s (+106-106)
- (modified) llvm/test/MC/AMDGPU/gfx12_asm_features.s (+1-1)
- (modified) llvm/test/MC/AMDGPU/gfx12_asm_vop1.s (+5-5)
- (modified) llvm/test/MC/AMDGPU/gfx12_asm_vop1_fake16_promote.s (+355-355)
- (modified) llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s (+697-697)
- (modified) llvm/test/MC/AMDGPU/gfx12_asm_vop2_fake16_promote.s (+23-23)
- (modified) llvm/test/MC/AMDGPU/gfx12_asm_vop2_t16_promote.s (+47-47)
- (modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3-fake16.s (+1014-1014)
- (modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3.s (+1049-1049)
- (modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3_aliases-fake16.s (+4-4)
- (modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3_aliases.s (+4-4)
- (modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1-fake16.s (+1215-1215)
- (modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s (+1287-1287)
- (modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2-fake16.s (+612-612)
- (modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2.s (+636-636)
- (modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3c-fake16.s (+2193-2193)
- (modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3c.s (+2340-2340)
- (modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3cx-fake16.s (+1137-1137)
- (modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3cx.s (+1192-1192)
- (modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16-fake16.s (+864-864)
- (modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s (+945-945)
- (modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8-fake16.s (+298-298)
- (modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s (+379-379)
- (modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3p.s (+271-271)
- (modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3p_aliases.s (+2-2)
- (modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3p_features.s (+16-16)
- (modified) llvm/test/MC/AMDGPU/gfx12_asm_vopc_fake16_promote.s (+269-269)
- (modified) llvm/test/MC/AMDGPU/gfx12_asm_vopc_t16_promote.s (+538-538)
- (modified) llvm/test/MC/AMDGPU/gfx12_asm_vopcx_fake16_promote.s (+162-162)
- (modified) llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_promote.s (+324-324)
- (modified) llvm/test/MC/AMDGPU/lds_direct-gfx10.s (+2-2)
- (modified) llvm/test/MC/AMDGPU/literals.s (+90-90)
- (modified) llvm/test/MC/AMDGPU/literalv216.s (+52-52)
- (modified) llvm/test/MC/AMDGPU/reg-syntax-extra.s (+1-1)
- (modified) llvm/test/MC/AMDGPU/reloc-operands-gfx10.s (+1-1)
- (modified) llvm/test/MC/AMDGPU/vcmpx-gfx10.s (+7-7)
- (modified) llvm/test/MC/AMDGPU/vop3-literal.s (+110-110)
- (modified) llvm/test/MC/AMDGPU/wave32.s (+14-14)
- (modified) llvm/test/MC/AMDGPU/wave_any.s (+15-15)
- (modified) llvm/test/MC/AMDGPU/writelane_m0.s (+3-3)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx10-sgpr-max.txt (+1-1)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx10-vop3-literal.txt (+14-14)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1030_new.txt (+2-2)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt (+3733-3733)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3c.txt (+5392-5392)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3cx.txt (+2370-2370)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3p_literalv216.txt (+34-34)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt (+1643-1643)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vopcx.txt (+1450-1450)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vopcx.txt (+454-454)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt (+1540-1540)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop2.txt (+908-908)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopc.txt (+2919-2919)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopcx.txt (+1509-1509)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3p.txt (+240-240)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop2.txt (+39-39)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt (+241-241)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt (+1987-1987)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3cx.txt (+1137-1137)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3p.txt (+104-104)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt (+2028-2028)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt (+8-8)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt (+1559-1559)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt (+4-1)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop2.txt (+582-582)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3c.txt (+2521-2521)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt (+1303-1303)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt (+1842-1842)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp8.txt (+497-497)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p.txt (+270-270)
- (modified) llvm/test/MC/Disassembler/AMDGPU/literals.txt (+2-2)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
index 49e94183202bd..598d6e46207b1 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
@@ -96,6 +96,10 @@ class AMDGPUMCCodeEmitter : public MCCodeEmitter {
APInt &Inst, APInt &Scratch,
const MCSubtargetInfo &STI) const;
+ template <bool HasSrc0, bool HasSrc1, bool HasSrc2>
+ APInt postEncodeVOP3(const MCInst &MI, APInt EncodedValue,
+ const MCSubtargetInfo &STI) const;
+
APInt postEncodeVOPCX(const MCInst &MI, APInt EncodedValue,
const MCSubtargetInfo &STI) const;
};
@@ -719,6 +723,23 @@ void AMDGPUMCCodeEmitter::getMachineOpValueCommon(
llvm_unreachable("Encoding of this operand type is not supported yet.");
}
+template <bool HasSrc0, bool HasSrc1, bool HasSrc2>
+APInt AMDGPUMCCodeEmitter::postEncodeVOP3(const MCInst &MI, APInt EncodedValue,
+ const MCSubtargetInfo &STI) const {
+ if (!AMDGPU::isGFX10Plus(STI))
+ return EncodedValue;
+ // Set unused source fields in VOP3 encodings to inline immediate 0 to avoid
+ // hardware conservatively assuming the instruction reads SGPRs.
+ constexpr uint64_t InlineImmediate0 = 0x80;
+ if (!HasSrc0)
+ EncodedValue |= InlineImmediate0 << 32;
+ if (!HasSrc1)
+ EncodedValue |= InlineImmediate0 << 41;
+ if (!HasSrc2)
+ EncodedValue |= InlineImmediate0 << 50;
+ return EncodedValue;
+}
+
APInt AMDGPUMCCodeEmitter::postEncodeVOPCX(const MCInst &MI, APInt EncodedValue,
const MCSubtargetInfo &STI) const {
// GFX10+ v_cmpx opcodes promoted to VOP3 have implied dst=EXEC.
@@ -732,7 +753,7 @@ APInt AMDGPUMCCodeEmitter::postEncodeVOPCX(const MCInst &MI, APInt EncodedValue,
Desc.hasImplicitDefOfPhysReg(AMDGPU::EXEC));
EncodedValue |= MRI.getEncodingValue(AMDGPU::EXEC_LO) &
AMDGPU::HWEncoding::LO256_REG_IDX_MASK;
- return EncodedValue;
+ return postEncodeVOP3<true, true, false>(MI, EncodedValue, STI);
}
#include "AMDGPUGenMCCodeEmitter.inc"
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index a96d54a8210c3..cb6a413f993bc 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -2478,6 +2478,7 @@ multiclass VOP3P_Real_LD_SCALE_gfx1250<bits<8> op> {
VOP3P_Real_Gen<ps, GFX1250Gen, ps.Mnemonic>,
VOP3Pe_gfx11_gfx12<op, ps.Pfl> {
let Inst{58-50} = 0x100; // scale src2 = vgpr0 (dummy)
+ let PostEncoderMethod = "";
}
}
diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index 95e40dd8e99d9..989181b21f3e9 100644
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -422,7 +422,6 @@ multiclass VOPC_Pseudos <string opName,
}
-let PostEncoderMethod = "postEncodeVOPCX" in
multiclass VOPCX_Pseudos <string opName,
VOPC_Profile P, VOPC_Profile P_NoSDst,
SDPatternOperator cond = COND_NULL,
@@ -1120,7 +1119,6 @@ multiclass VOPC_Class_Pseudos <string opName, VOPC_Profile p, bit DefExec,
} // end SubtargetPredicate = isGFX11Plus
}
-let PostEncoderMethod = "postEncodeVOPCX" in
multiclass VOPCX_Class_Pseudos <string opName,
VOPC_Profile P,
VOPC_Profile P_NoSDst> :
@@ -1536,7 +1534,8 @@ class VOPC64_DPP<VOP_DPP_Pseudo ps, string opName = ps.OpName>
let Uses = ps.Uses;
let OtherPredicates = ps.OtherPredicates;
let Constraints = ps.Constraints;
- let PostEncoderMethod = ps.PostEncoderMethod;
+
+ let PostEncoderMethod = !if(!empty(ps.Defs), "", "postEncodeVOPCX");
}
class VOPC64_DPP16_Dst<bits<10> op, VOP_DPP_Pseudo ps,
@@ -1577,7 +1576,8 @@ class VOPC64_DPP8<VOP_Pseudo ps, string opName = ps.OpName>
let Uses = ps.Uses;
let OtherPredicates = ps.OtherPredicates;
let True16Predicate = ps.True16Predicate;
- let PostEncoderMethod = ps.PostEncoderMethod;
+
+ let PostEncoderMethod = !if(!empty(ps.Defs), "", "postEncodeVOPCX");
}
class VOPC64_DPP8_Dst<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName>
@@ -1780,6 +1780,7 @@ multiclass VOPCX_Real<GFXGen Gen, bits<9> op> {
let Inst{7-0} = ?; // sdst
let AsmString = !subst("_nosdst", "", ps64.Mnemonic)
# "{_e64} " # ps64.AsmOperands;
+ let PostEncoderMethod = "postEncodeVOPCX";
}
defm : VOPCXInstAliases<NAME, !substr(Gen.Suffix, 1)>;
@@ -1841,6 +1842,7 @@ multiclass VOPCX_Real_with_name<GFXGen Gen, bits<9> op, string OpName,
let Inst{7-0} = ?; // sdst
let Inst{14} = 0;
let AsmString = asm_name # "{_e64} " # ps64.AsmOperands;
+ let PostEncoderMethod = "postEncodeVOPCX";
}
} else {
def _e64#Gen.Suffix
@@ -1848,6 +1850,7 @@ multiclass VOPCX_Real_with_name<GFXGen Gen, bits<9> op, string OpName,
VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> {
let Inst{7-0} = ?; // sdst
let AsmString = asm_name # "{_e64} " # ps64.AsmOperands;
+ let PostEncoderMethod = "postEncodeVOPCX";
}
}
@@ -2189,6 +2192,7 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
let Inst{7-0} = ?; // sdst
let AsmString = !subst("_nosdst", "", !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").Mnemonic)
# "{_e64} " # !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").AsmOperands;
+ let PostEncoderMethod = "postEncodeVOPCX";
}
if !cast<VOPC_Pseudo>(NAME#"_nosdst_e32").Pfl.HasExtSDWA9 then
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 6056cd22875a7..39a202dc9c379 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -199,9 +199,10 @@ class VOP3_Real <VOP_Pseudo ps, int EncodingFamily, string asm_name = ps.Mnemoni
let mayStore = ps.mayStore;
let TRANS = ps.TRANS;
let isConvergent = ps.isConvergent;
- let PostEncoderMethod = ps.PostEncoderMethod;
VOPProfile Pfl = ps.Pfl;
+
+ let PostEncoderMethod = !if(!and(Pfl.HasSrc0, Pfl.HasSrc1, Pfl.HasSrc2), "", "postEncodeVOP3<"#Pfl.HasSrc0#","#Pfl.HasSrc1#","#Pfl.HasSrc2#">");
}
class VOP3_Real_Gen <VOP_Pseudo ps, GFXGen Gen, string asm_name = ps.Mnemonic> :
diff --git a/llvm/test/CodeGen/AMDGPU/code-size-estimate-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/code-size-estimate-gfx1250.ll
index fcbf7efdaa653..75af0b5f15306 100644
--- a/llvm/test/CodeGen/AMDGPU/code-size-estimate-gfx1250.ll
+++ b/llvm/test/CodeGen/AMDGPU/code-size-estimate-gfx1250.ll
@@ -6,7 +6,7 @@ define i16 @cvt_pk_bf8_f16_v(ptr addrspace(1) %out) {
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf]
; GFX1250-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
-; GFX1250-NEXT: v_cvt_pk_bf8_f16 v0, 0x38003800 ; encoding: [0x00,0x00,0x73,0xd7,0xff,0x00,0x00,0x00,0x00,0x38,0x00,0x38]
+; GFX1250-NEXT: v_cvt_pk_bf8_f16 v0, 0x38003800 ; encoding: [0x00,0x00,0x73,0xd7,0xff,0x00,0x01,0x02,0x00,0x38,0x00,0x38]
; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
%cvt = tail call i16 @llvm.amdgcn.cvt.pk.bf8.f16(<2 x half> <half 0xH3800, half 0xH3800>)
ret i16 %cvt
@@ -19,7 +19,7 @@ define i16 @cvt_pk_fp8_f16_v(ptr addrspace(1) %out) {
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf]
; GFX1250-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
-; GFX1250-NEXT: v_cvt_pk_fp8_f16 v0, 0x3800 ; encoding: [0x00,0x00,0x72,0xd7,0xff,0x00,0x00,0x00,0x00,0x38,0x00,0x00]
+; GFX1250-NEXT: v_cvt_pk_fp8_f16 v0, 0x3800 ; encoding: [0x00,0x00,0x72,0xd7,0xff,0x00,0x01,0x02,0x00,0x38,0x00,0x00]
; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
%cvt = tail call i16 @llvm.amdgcn.cvt.pk.fp8.f16(<2 x half> <half 0xH3800, half 0xH0>)
ret i16 %cvt
diff --git a/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll b/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll
index 00da5613820cd..f7c6db60678f9 100644
--- a/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll
+++ b/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll
@@ -145,13 +145,13 @@ define float @v_mul_f32_vop3_src_mods(float %x, float %y) {
; GFX10-LABEL: v_mul_f32_vop3_src_mods:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
-; GFX10-NEXT: v_mul_f32_e64 v0, |v0|, v1 ; encoding: [0x00,0x01,0x08,0xd5,0x00,0x03,0x02,0x00]
+; GFX10-NEXT: v_mul_f32_e64 v0, |v0|, v1 ; encoding: [0x00,0x01,0x08,0xd5,0x00,0x03,0x02,0x02]
; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
;
; GFX11-LABEL: v_mul_f32_vop3_src_mods:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
-; GFX11-NEXT: v_mul_f32_e64 v0, |v0|, v1 ; encoding: [0x00,0x01,0x08,0xd5,0x00,0x03,0x02,0x00]
+; GFX11-NEXT: v_mul_f32_e64 v0, |v0|, v1 ; encoding: [0x00,0x01,0x08,0xd5,0x00,0x03,0x02,0x02]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
;
; GFX1200-LABEL: v_mul_f32_vop3_src_mods:
@@ -161,14 +161,14 @@ define float @v_mul_f32_vop3_src_mods(float %x, float %y) {
; GFX1200-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; encoding: [0x00,0x00,0xc3,0xbf]
; GFX1200-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
-; GFX1200-NEXT: v_mul_f32_e64 v0, |v0|, v1 ; encoding: [0x00,0x01,0x08,0xd5,0x00,0x03,0x02,0x00]
+; GFX1200-NEXT: v_mul_f32_e64 v0, |v0|, v1 ; encoding: [0x00,0x01,0x08,0xd5,0x00,0x03,0x02,0x02]
; GFX1200-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
;
; GFX1250-LABEL: v_mul_f32_vop3_src_mods:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf]
; GFX1250-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
-; GFX1250-NEXT: v_mul_f32_e64 v0, |v0|, v1 ; encoding: [0x00,0x01,0x08,0xd5,0x00,0x03,0x02,0x00]
+; GFX1250-NEXT: v_mul_f32_e64 v0, |v0|, v1 ; encoding: [0x00,0x01,0x08,0xd5,0x00,0x03,0x02,0x02]
; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
%fabs.x = call float @llvm.fabs.f32(float %x)
%mul = fmul float %fabs.x, %y
@@ -188,13 +188,13 @@ define float @v_mul_f32_vop3_src_mods_inline_imm(float %x, float %y) {
; GFX10-LABEL: v_mul_f32_vop3_src_mods_inline_imm:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
-; GFX10-NEXT: v_mul_f32_e64 v0, |v0|, 4.0 ; encoding: [0x00,0x01,0x08,0xd5,0x00,0xed,0x01,0x00]
+; GFX10-NEXT: v_mul_f32_e64 v0, |v0|, 4.0 ; encoding: [0x00,0x01,0x08,0xd5,0x00,0xed,0x01,0x02]
; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
;
; GFX11-LABEL: v_mul_f32_vop3_src_mods_inline_imm:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
-; GFX11-NEXT: v_mul_f32_e64 v0, |v0|, 4.0 ; encoding: [0x00,0x01,0x08,0xd5,0x00,0xed,0x01,0x00]
+; GFX11-NEXT: v_mul_f32_e64 v0, |v0|, 4.0 ; encoding: [0x00,0x01,0x08,0xd5,0x00,0xed,0x01,0x02]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
;
; GFX1200-LABEL: v_mul_f32_vop3_src_mods_inline_imm:
@@ -204,14 +204,14 @@ define float @v_mul_f32_vop3_src_mods_inline_imm(float %x, float %y) {
; GFX1200-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; encoding: [0x00,0x00,0xc3,0xbf]
; GFX1200-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
-; GFX1200-NEXT: v_mul_f32_e64 v0, |v0|, 4.0 ; encoding: [0x00,0x01,0x08,0xd5,0x00,0xed,0x01,0x00]
+; GFX1200-NEXT: v_mul_f32_e64 v0, |v0|, 4.0 ; encoding: [0x00,0x01,0x08,0xd5,0x00,0xed,0x01,0x02]
; GFX1200-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
;
; GFX1250-LABEL: v_mul_f32_vop3_src_mods_inline_imm:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf]
; GFX1250-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
-; GFX1250-NEXT: v_mul_f32_e64 v0, |v0|, 4.0 ; encoding: [0x00,0x01,0x08,0xd5,0x00,0xed,0x01,0x00]
+; GFX1250-NEXT: v_mul_f32_e64 v0, |v0|, 4.0 ; encoding: [0x00,0x01,0x08,0xd5,0x00,0xed,0x01,0x02]
; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
%fabs.x = call float @llvm.fabs.f32(float %x)
%mul = fmul float %fabs.x, 4.0
@@ -233,13 +233,13 @@ define float @v_mul_f32_vop3_src_mods_literal(float %x, float %y) {
; GFX10-LABEL: v_mul_f32_vop3_src_mods_literal:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
-; GFX10-NEXT: v_mul_f32_e64 v0, 0x42f60000, |v0| ; encoding: [0x00,0x02,0x08,0xd5,0xff,0x00,0x02,0x00,0x00,0x00,0xf6,0x42]
+; GFX10-NEXT: v_mul_f32_e64 v0, 0x42f60000, |v0| ; encoding: [0x00,0x02,0x08,0xd5,0xff,0x00,0x02,0x02,0x00,0x00,0xf6,0x42]
; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
;
; GFX11-LABEL: v_mul_f32_vop3_src_mods_literal:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
-; GFX11-NEXT: v_mul_f32_e64 v0, 0x42f60000, |v0| ; encoding: [0x00,0x02,0x08,0xd5,0xff,0x00,0x02,0x00,0x00,0x00,0xf6,0x42]
+; GFX11-NEXT: v_mul_f32_e64 v0, 0x42f60000, |v0| ; encoding: [0x00,0x02,0x08,0xd5,0xff,0x00,0x02,0x02,0x00,0x00,0xf6,0x42]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
;
; GFX1200-LABEL: v_mul_f32_vop3_src_mods_literal:
@@ -249,14 +249,14 @@ define float @v_mul_f32_vop3_src_mods_literal(float %x, float %y) {
; GFX1200-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; encoding: [0x00,0x00,0xc3,0xbf]
; GFX1200-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
-; GFX1200-NEXT: v_mul_f32_e64 v0, 0x42f60000, |v0| ; encoding: [0x00,0x02,0x08,0xd5,0xff,0x00,0x02,0x00,0x00,0x00,0xf6,0x42]
+; GFX1200-NEXT: v_mul_f32_e64 v0, 0x42f60000, |v0| ; encoding: [0x00,0x02,0x08,0xd5,0xff,0x00,0x02,0x02,0x00,0x00,0xf6,0x42]
; GFX1200-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
;
; GFX1250-LABEL: v_mul_f32_vop3_src_mods_literal:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf]
; GFX1250-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
-; GFX1250-NEXT: v_mul_f32_e64 v0, 0x42f60000, |v0| ; encoding: [0x00,0x02,0x08,0xd5,0xff,0x00,0x02,0x00,0x00,0x00,0xf6,0x42]
+; GFX1250-NEXT: v_mul_f32_e64 v0, 0x42f60000, |v0| ; encoding: [0x00,0x02,0x08,0xd5,0xff,0x00,0x02,0x02,0x00,0x00,0xf6,0x42]
; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
%fabs.x = call float @llvm.fabs.f32(float %x)
%mul = fmul float %fabs.x, 123.0
@@ -280,7 +280,7 @@ define float @v_mul_f32_vop2_frame_index(float %x) {
; GFX10-LABEL: v_mul_f32_vop2_frame_index:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
-; GFX10-NEXT: v_lshrrev_b32_e64 v1, 5, s32 ; encoding: [0x01,0x00,0x16,0xd5,0x85,0x40,0x00,0x00]
+; GFX10-NEXT: v_lshrrev_b32_e64 v1, 5, s32 ; encoding: [0x01,0x00,0x16,0xd5,0x85,0x40,0x00,0x02]
; GFX10-NEXT: v_mul_f32_e32 v0, v1, v0 ; encoding: [0x01,0x01,0x00,0x10]
; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
;
@@ -609,13 +609,13 @@ define double @v_mul_f64_vop2_literal_32(double %x) {
; GFX10-LABEL: v_mul_f64_vop2_literal_32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
-; GFX10-NEXT: v_mul_f64 v[0:1], 0x405ec000, v[0:1] ; encoding: [0x00,0x00,0x65,0xd5,0xff,0x00,0x02,0x00,0x00,0xc0,0x5e,0x40]
+; GFX10-NEXT: v_mul_f64 v[0:1], 0x405ec000, v[0:1] ; encoding: [0x00,0x00,0x65,0xd5,0xff,0x00,0x02,0x02,0x00,0xc0,0x5e,0x40]
; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
;
; GFX11-LABEL: v_mul_f64_vop2_literal_32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
-; GFX11-NEXT: v_mul_f64 v[0:1], 0x405ec000, v[0:1] ; encoding: [0x00,0x00,0x28,0xd7,0xff,0x00,0x02,0x00,0x00,0xc0,0x5e,0x40]
+; GFX11-NEXT: v_mul_f64 v[0:1], 0x405ec000, v[0:1] ; encoding: [0x00,0x00,0x28,0xd7,0xff,0x00,0x02,0x02,0x00,0xc0,0x5e,0x40]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
;
; GFX1200-LABEL: v_mul_f64_vop2_literal_32:
@@ -658,7 +658,7 @@ define double @v_mul_f64_vop2_literal_64(double %x) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
; GFX10-NEXT: s_mov_b32 s4, 0x66666666 ; encoding: [0xff,0x03,0x84,0xbe,0x66,0x66,0x66,0x66]
; GFX10-NEXT: s_mov_b32 s5, 0x405ec666 ; encoding: [0xff,0x03,0x85,0xbe,0x66,0xc6,0x5e,0x40]
-; GFX10-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] ; encoding: [0x00,0x00,0x65,0xd5,0x00,0x09,0x00,0x00]
+; GFX10-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] ; encoding: [0x00,0x00,0x65,0xd5,0x00,0x09,0x00,0x02]
; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
;
; GFX11-LABEL: v_mul_f64_vop2_literal_64:
@@ -667,7 +667,7 @@ define double @v_mul_f64_vop2_literal_64(double %x) {
; GFX11-NEXT: s_mov_b32 s0, 0x66666666 ; encoding: [0xff,0x00,0x80,0xbe,0x66,0x66,0x66,0x66]
; GFX11-NEXT: s_mov_b32 s1, 0x405ec666 ; encoding: [0xff,0x00,0x81,0xbe,0x66,0xc6,0x5e,0x40]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; encoding: [0x09,0x00,0x87,0xbf]
-; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] ; encoding: [0x00,0x00,0x28,0xd7,0x00,0x01,0x00,0x00]
+; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] ; encoding: [0x00,0x00,0x28,0xd7,0x00,0x01,0x00,0x02]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
;
; GFX1200-LABEL: v_mul_f64_vop2_literal_64:
@@ -710,14 +710,14 @@ define i64 @v_add_u64_vop2_literal_32(i64 %x) {
; GFX10-LABEL: v_add_u64_vop2_literal_32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
-; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7b, v0 ; encoding: [0x00,0x6a,0x0f,0xd7,0xff,0x00,0x02,0x00,0x7b,0x00,0x00,0x00]
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7b, v0 ; encoding: [0x00,0x6a,0x0f,0xd7,0xff,0x00,0x02,0x02,0x7b,0x00,0x00,0x00]
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; encoding: [0x01,0x7d,0x28,0xd5,0x80,0x02,0xaa,0x01]
; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
;
; GFX11-LABEL: v_add_u64_vop2_literal_32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x7b, v0 ; encoding: [0x00,0x6a,0x00,0xd7,0xff,0x00,0x02,0x00,0x7b,0x00,0x00,0x00]
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x7b, v0 ; encoding: [0x00,0x6a,0x00,0xd7,0xff,0x00,0x02,0x02,0x7b,0x00,0x00,0x00]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; encoding: [0x01,0x7c,0x20,0xd5,0x80,0x02,0xaa,0x01]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
@@ -729,7 +729,7 @@ define i64 @v_add_u64_vop2_literal_32(i64 %x) {
; GFX1200-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; encoding: [0x00,0x00,0xc3,0xbf]
; GFX1200-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
-; GFX1200-NEXT: v_add_co_u32 v0, vcc_lo, 0x7b, v0 ; encoding: [0x00,0x6a,0x00,0xd7,0xff,0x00,0x02,0x00,0x7b,0x00,0x00,0x00]
+; GFX1200-NEXT: v_add_co_u32 v0, vcc_lo, 0x7b, v0 ; encoding: [0x00,0x6a,0x00,0xd7,0xff,0x00,0x02,0x02,0x7b,0x00,0x00,0x00]
; GFX1200-NEXT: s_wait_alu depctr_va_vcc(0) ; encoding: [0x9d,0xff,0x88,0xbf]
; GFX1200-NEXT: v_add_co_c...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/175753
More information about the llvm-commits mailing list