[llvm] 39337ff - AMDGPU: Handle cvt_scale F32/F16->F4/F8 gfx950 hazard (#117844)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Dec 2 06:23:20 PST 2024
Author: Matt Arsenault
Date: 2024-12-02T09:23:17-05:00
New Revision: 39337ff2dc366fde83b07193b72c294a846c5959
URL: https://github.com/llvm/llvm-project/commit/39337ff2dc366fde83b07193b72c294a846c5959
DIFF: https://github.com/llvm/llvm-project/commit/39337ff2dc366fde83b07193b72c294a846c5959.diff
LOG: AMDGPU: Handle cvt_scale F32/F16->F4/F8 gfx950 hazard (#117844)
gfx950 SP changes doc says:
No 4 clk forwarding on opcodes that convert from
F32/F16->F8 or F32/F16->F4. Must insert a NOP or
instruction writing some other destination VREG
after a conversion to F4/F8 since it writes either
low/high half or bytes.
Co-authored-by: Pravin Jagtap <Pravin.Jagtap at amd.com>
Co-authored-by: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Added:
Modified:
llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
llvm/lib/Target/AMDGPU/GCNSubtarget.h
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
llvm/lib/Target/AMDGPU/VOP3Instructions.td
llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll
llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 4c37ef8855a5ba..ecf03b14143ee3 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -909,8 +909,9 @@ getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
// There are three different types of instructions
// which produce forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3
- // which write hi bits (e.g. op_sel[3] == 1), and 3. CVR_SR_FP8_F32 and
- // CVT_SR_BF8_F32 with op_sel[3:2]
+ // which write hi bits (e.g. op_sel[3] == 1), and 3. FP8DstSelInst
+ // (instructions with dest byte sel, e.g. CVT_SR_BF8_F32) and
+ // op_sel[3:2]
// != 0
if (SIInstrInfo::isSDWA(MI)) {
// Type 1: SDWA with dst_sel != DWORD
@@ -918,8 +919,8 @@ getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
return nullptr;
} else {
- // Type 2 && Type 3: (VOP3 which write the hi bits) || (CVT_SR_FP8_F32 and
- // CVT_SR_BF8_F32 with op_sel[3:2] != 0)
+ // Type 2 && Type 3: (VOP3 which write the hi bits) || (FP8DstSelInst
+ // with op_sel[3:2] != 0)
if (!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel) ||
!(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
SISrcMods::DST_OP_SEL ||
@@ -983,7 +984,7 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
}
- if (ST.hasDstSelForwardingHazard()) {
+ if (ST.hasDstSelForwardingHazard() || ST.hasCvtScaleForwardingHazard()) {
const int Shift16DefWaitstates = 1;
auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
@@ -1094,7 +1095,8 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
// problematic thus far.
// see checkVALUHazards()
- if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard())
+ if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard() &&
+ !ST.hasCvtScaleForwardingHazard())
return 0;
const MachineRegisterInfo &MRI = MF.getRegInfo();
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index ea5e159fdd8363..5cecaf6349c883 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1264,6 +1264,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool hasVALUTransUseHazard() const { return HasVALUTransUseHazard; }
+ bool hasCvtScaleForwardingHazard() const { return GFX950Insts; }
+
bool hasForceStoreSC0SC1() const { return HasForceStoreSC0SC1; }
bool requiresCodeObjectV6() const { return RequiresCOV6; }
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index ab5f0694c07f95..5a0e812748fbb7 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -378,6 +378,14 @@ struct VOPTrue16Info {
bool IsTrue16;
};
+#define GET_FP8DstByteSelTable_DECL
+#define GET_FP8DstByteSelTable_IMPL
+
+struct DPMACCInstructionInfo {
+ uint16_t Opcode;
+ bool IsDPMACCInstruction;
+};
+
struct FP8DstByteSelInfo {
uint16_t Opcode;
bool HasFP8DstByteSel;
@@ -418,6 +426,8 @@ struct FP8DstByteSelInfo {
#define GET_getMFMA_F8F6F4_WithSize_DECL
#define GET_getMFMA_F8F6F4_WithSize_IMPL
#define GET_isMFMA_F8F6F4Table_IMPL
+#define GET_isCvtScaleF32_F32F16ToF8F4Table_IMPL
+
#include "AMDGPUGenSearchableTables.inc"
int getMTBUFBaseOpcode(unsigned Opc) {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 9f7fbec6a542f7..ea497d7b239d7e 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -103,6 +103,10 @@ struct MFMA_F8F6F4_Info {
uint8_t NumRegsSrcB;
};
+struct CvtScaleF32_F32F16ToF8F4_Info {
+ unsigned Opcode;
+};
+
#define GET_MIMGBaseOpcode_DECL
#define GET_MIMGDim_DECL
#define GET_MIMGEncoding_DECL
@@ -112,6 +116,7 @@ struct MFMA_F8F6F4_Info {
#define GET_MAIInstInfoTable_DECL
#define GET_MAIInstInfoTable_DECL
#define GET_isMFMA_F8F6F4Table_DECL
+#define GET_isCvtScaleF32_F32F16ToF8F4Table_DECL
#include "AMDGPUGenSearchableTables.inc"
namespace IsaInfo {
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index c8c36714909adf..1160975f3302a9 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -970,11 +970,16 @@ class VOP3_CVT_SCALE_FP4FP8BF8_F32_TiedInput_Profile<VOPProfile P> : VOP3_Profil
let HasOMod = 0;
}
+class VOP3_CVT_SCALE_FP4_F32_TiedInput_Profile<VOPProfile P> : VOP3_CVT_SCALE_FP4FP8BF8_F32_TiedInput_Profile<P> {
+ let HasFP8DstByteSel = 1;
+}
+
class VOP3_CVT_SCALE_SR_F8BF8_F16BF16F32_TiedInput_Profile<VOPProfile P> : VOP3_CVT_SCALE_FP4FP8BF8_F32_TiedInput_Profile<P> {
let InsVOP3OpSel = (ins FP32InputMods:$src0_modifiers, Src0RC64:$src0,
Int32InputMods:$src1_modifiers, Src1RC64:$src1,
FP32InputMods:$src2_modifiers, Src2RC64:$src2,
VGPR_32:$vdst_in, op_sel0:$op_sel);
+ let HasFP8DstByteSel = 1;
}
@@ -992,6 +997,7 @@ class VOP3_CVT_SCALE_FP4_F16BF16_TiedInput_Profile<VOPProfile P> : VOP3_Profile<
HasSrc0FloatMods, HasSrc1FloatMods,
HasSrc2FloatMods>.ret);
let HasExtVOP3DPP = 0;
+ let HasFP8DstByteSel = 1;
}
class VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<ValueType Src0Ty> :
@@ -1004,6 +1010,7 @@ class VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<ValueType Src0Ty> :
let HasExtVOP3DPP = 0;
let HasOpSel = 1;
let HasOMod = 0;
+ let HasFP8DstByteSel = 1;
}
def VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile : VOP3_Profile<VOPProfile<[i32, v2f32, i32, f32]>, VOP3_OPSEL> {
@@ -1015,6 +1022,7 @@ def VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile : VOP3_Profile<VOPProfile<[i32
let HasExtVOP3DPP = 0;
let HasOpSel = 1;
let HasOMod = 0;
+ let HasFP8DstByteSel = 1;
}
class VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<ValueType DstTy> : VOP3_Profile<VOPProfile<[DstTy, i32, f32, untyped]>,
@@ -1090,7 +1098,7 @@ let SubtargetPredicate = HasBF8ConversionScaleInsts, mayRaiseFPException = 0 in
let SubtargetPredicate = HasFP4ConversionScaleInsts, mayRaiseFPException = 0 in {
defm V_CVT_SCALEF32_PK_F32_FP4 : VOP3Inst<"v_cvt_scalef32_pk_f32_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<v2f32>>;
let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
- defm V_CVT_SCALEF32_PK_FP4_F32 : VOP3Inst<"v_cvt_scalef32_pk_fp4_f32", VOP3_CVT_SCALE_FP4FP8BF8_F32_TiedInput_Profile<VOP_I32_F32_F32_F32>>;
+ defm V_CVT_SCALEF32_PK_FP4_F32 : VOP3Inst<"v_cvt_scalef32_pk_fp4_f32", VOP3_CVT_SCALE_FP4_F32_TiedInput_Profile<VOP_I32_F32_F32_F32>>;
let Constraints = "@earlyclobber $vdst" in {
defm V_CVT_SCALEF32_SR_PK_FP4_F16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2f16>>;
defm V_CVT_SCALEF32_SR_PK_FP4_BF16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_bf16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2bf16>>;
@@ -2047,6 +2055,7 @@ multiclass VOP3_Real_BITOP3_gfx9<bits<10> op, string AsmName, bit isSingle = 0>
}
}
}
+
} // End AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9"
defm V_MAD_U64_U32 : VOP3be_Real_vi <0x1E8>;
diff --git a/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir b/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir
index 75834316750951..6a25e346c89447 100644
--- a/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir
+++ b/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir
@@ -255,3 +255,399 @@ body: |
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec
...
+
+
+---
+# GCN-LABEL: name: test_cvt_scalef32_sr_fp8_bf16_hazard
+# GCN: V_CVT_SCALEF32_SR_FP8_BF16_e64
+# GCN: GLOBAL_STORE_DWORD
+name: test_cvt_scalef32_sr_fp8_bf16_hazard
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ renamable $vgpr5 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, implicit $exec
+ S_WAITCNT 3952
+ renamable $vgpr5 = V_CVT_SCALEF32_SR_FP8_BF16_e64 8, killed $vgpr2, 0, killed $vgpr3, 4, killed $vgpr4, killed $vgpr5, 0, implicit $mode, implicit $exec
+ GLOBAL_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr5, 0, 0, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+# GCN-LABEL: name: test_cvt_scalef32_sr_fp8_f16_hazard
+# GCN: V_CVT_SCALEF32_SR_FP8_F16_e64
+# GCN: GLOBAL_STORE_DWORD
+name: test_cvt_scalef32_sr_fp8_f16_hazard
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ renamable $vgpr5 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, implicit $exec
+ S_WAITCNT 3952
+ renamable $vgpr5 = V_CVT_SCALEF32_SR_FP8_F16_e64 8, killed $vgpr2, 0, killed $vgpr3, 4, killed $vgpr4, killed $vgpr5, 0, implicit $mode, implicit $exec
+ GLOBAL_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr5, 0, 0, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+# GCN-LABEL: name: test_cvt_scalef32_sr_fp8_f32_hazard
+# GCN: V_CVT_SCALEF32_SR_FP8_F32_e64
+# GCN: S_NOP 0
+# GCN: V_ADD_U32_e32
+name: test_cvt_scalef32_sr_fp8_f32_hazard
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ renamable $vgpr5 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, implicit $exec
+ S_WAITCNT 3952
+ renamable $vgpr5 = V_CVT_SCALEF32_SR_FP8_F32_e64 8, killed $vgpr2, 0, killed $vgpr3, 4, killed $vgpr4, killed $vgpr5, 0, implicit $mode, implicit $exec
+ renamable $vgpr2 = V_ADD_U32_e32 4, killed $vgpr5, implicit $exec
+ GLOBAL_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+# GCN-LABEL: test_cvt_scalef32_pk_fp8_f32_hazard
+# GCN: V_CVT_SCALEF32_PK_FP8_F32_e64
+# GCN: S_NOP 0
+# GCN: V_PK_ADD_U16
+name: test_cvt_scalef32_pk_fp8_f32_hazard
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ S_WAITCNT 0
+ renamable $vgpr1 = V_AND_B32_e32 2147483647, killed $vgpr1, implicit $exec
+ renamable $vgpr2 = V_XOR_B32_e32 -2147483648, killed $vgpr2, implicit $exec
+ renamable $vgpr0 = V_CVT_SCALEF32_PK_FP8_F32_e64 8, killed $vgpr1, 0, killed $vgpr2, 0, killed $vgpr3, killed $vgpr0, 0, implicit $mode, implicit $exec
+ renamable $vgpr0 = V_PK_ADD_U16 8, killed $vgpr0, 8, $vgpr0, 0, 0, 0, 0, 0, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
+
+---
+# GCN-LABEL: test_cvt_scalef32_pk_fp8_f16_hazard
+# GCN: V_CVT_SCALEF32_PK_FP8_F16_e64
+# GCN: S_NOP 0
+# GCN: V_PK_ADD_U16
+name: test_cvt_scalef32_pk_fp8_f16_hazard
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+ S_WAITCNT 0
+ renamable $vgpr0 = V_CVT_SCALEF32_PK_FP8_F16_e64 8, killed $vgpr1, 0, killed $vgpr2, killed $vgpr0, 0, implicit $mode, implicit $exec
+ renamable $vgpr0 = V_PK_ADD_U16 8, killed $vgpr0, 8, $vgpr0, 0, 0, 0, 0, 0, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
+
+---
+# GCN-LABEL: test_cvt_scalef32_pk_fp8_bf16_hazard
+# GCN: V_CVT_SCALEF32_SR_BF8_BF16_e64
+# GCN: S_NOP 0
+# GCN: V_ADD_U32_e32
+name: test_cvt_scalef32_pk_fp8_bf16_hazard
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ S_WAITCNT 0
+ renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec
+ S_WAITCNT 3952
+ renamable $vgpr0 = V_CVT_SCALEF32_SR_BF8_BF16_e64 8, killed $vgpr2, 0, killed $vgpr3, 4, killed $vgpr4, killed $vgpr0, 0, implicit $mode, implicit $exec
+ renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr0, $vgpr0, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
+
+---
+# GCN-LABEL: test_cvt_scalef32_sr_bf8_f16_hazard
+# GCN: V_CVT_SCALEF32_SR_BF8_F16_e64
+# GCN: S_NOP 0
+# GCN: V_ADD_U32_e32
+name: test_cvt_scalef32_sr_bf8_f16_hazard
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ S_WAITCNT 0
+ renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec
+ S_WAITCNT 3952
+ renamable $vgpr0 = V_CVT_SCALEF32_SR_BF8_F16_e64 8, killed $vgpr2, 0, killed $vgpr3, 4, killed $vgpr4, killed $vgpr0, 0, implicit $mode, implicit $exec
+ renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr0, $vgpr0, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
+
+---
+# GCN-LABEL: test_cvt_scalef32_sr_bf8_f32_hazard
+# GCN: V_CVT_SCALEF32_SR_BF8_F32_e64
+# GCN: S_NOP 0
+# GCN: V_ADD_U32_e32
+name: test_cvt_scalef32_sr_bf8_f32_hazard
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ S_WAITCNT 0
+ renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec
+ S_WAITCNT 3952
+ renamable $vgpr0 = V_CVT_SCALEF32_SR_BF8_F32_e64 8, killed $vgpr2, 0, killed $vgpr3, 4, killed $vgpr4, killed $vgpr0, 0, implicit $mode, implicit $exec
+ renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr0, $vgpr0, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
+
+---
+# GCN-LABEL: test_cvt_scalef32_pk_bf8_f32_hazard
+# GCN: V_CVT_SCALEF32_PK_BF8_F32_e64
+# GCN: S_NOP 0
+# GCN: V_PK_ADD_U16
+name: test_cvt_scalef32_pk_bf8_f32_hazard
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ S_WAITCNT 0
+ renamable $vgpr1 = V_AND_B32_e32 2147483647, killed $vgpr1, implicit $exec
+ renamable $vgpr2 = V_XOR_B32_e32 -2147483648, killed $vgpr2, implicit $exec
+ renamable $vgpr0 = V_CVT_SCALEF32_PK_BF8_F32_e64 8, killed $vgpr1, 0, killed $vgpr2, 0, killed $vgpr3, killed $vgpr0, 0, implicit $mode, implicit $exec
+ renamable $vgpr0 = V_PK_ADD_U16 8, killed $vgpr0, 8, $vgpr0, 0, 0, 0, 0, 0, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
+
+---
+# GCN-LABEL: test_cvt_scalef32_pk_bf8_f16_hazard
+# GCN: V_CVT_SCALEF32_PK_BF8_F16_e64
+# GCN: S_NOP 0
+# GCN: V_PK_ADD_U16
+name: test_cvt_scalef32_pk_bf8_f16_hazard
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+ S_WAITCNT 0
+ renamable $vgpr0 = V_CVT_SCALEF32_PK_BF8_F16_e64 8, killed $vgpr1, 0, killed $vgpr2, killed $vgpr0, 0, implicit $mode, implicit $exec
+ renamable $vgpr0 = V_PK_ADD_U16 8, killed $vgpr0, 8, $vgpr0, 0, 0, 0, 0, 0, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
+
+---
+# GCN-LABEL: test_cvt_scalef32_pk_bf8_bf16_hazard
+# GCN: V_CVT_SCALEF32_PK_BF8_BF16_e64
+# GCN: S_NOP 0
+# GCN: V_PK_ADD_U16
+name: test_cvt_scalef32_pk_bf8_bf16_hazard
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+ S_WAITCNT 0
+ renamable $vgpr0 = V_CVT_SCALEF32_PK_BF8_BF16_e64 8, killed $vgpr1, 0, killed $vgpr2, killed $vgpr0, 0, implicit $mode, implicit $exec
+ renamable $vgpr0 = V_PK_ADD_U16 8, killed $vgpr0, 8, $vgpr0, 0, 0, 0, 0, 0, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
+
+---
+# GCN-LABEL: test_cvt_scale_fp4_f32_hazard
+# GCN: V_CVT_SCALEF32_PK_FP4_F32_e64
+# GCN: S_NOP 0
+# GCN: V_ADD_U32_e32
+name: test_cvt_scale_fp4_f32_hazard
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ S_WAITCNT 0
+ renamable $vgpr1 = V_AND_B32_e32 2147483647, killed $vgpr1, implicit $exec
+ renamable $vgpr2 = V_XOR_B32_e32 -2147483648, killed $vgpr2, implicit $exec
+ renamable $vgpr0 = V_CVT_SCALEF32_PK_FP4_F32_e64 8, killed $vgpr1, 0, killed $vgpr2, 4, killed $vgpr3, killed $vgpr0, 0, implicit $mode, implicit $exec
+ renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr0, $vgpr0, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
+
+---
+# GCN-LABEL: test_scalef32_sr_pk_fp4_f16_hazard
+# GCN: V_CVT_SCALEF32_SR_PK_FP4_F16_e64
+# GCN: S_NOP 0
+# GCN: V_ADD_U32_e32
+name: test_scalef32_sr_pk_fp4_f16_hazard
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ S_WAITCNT 0
+ renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec
+ S_WAITCNT 3952
+ early-clobber renamable $vgpr1 = V_CVT_SCALEF32_SR_PK_FP4_F16_e64 8, killed $vgpr2, 0, killed $vgpr3, 4, killed $vgpr4, killed $vgpr0, 0, implicit $mode, implicit $exec
+ renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr1, $vgpr1, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
+
+---
+# GCN-LABEL: test_scalef32_sr_pk_fp4_bf16_hazard
+# GCN: V_CVT_SCALEF32_SR_PK_FP4_BF16_e64
+# GCN: S_NOP 0
+# GCN: V_ADD_U32_e32
+name: test_scalef32_sr_pk_fp4_bf16_hazard
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ S_WAITCNT 0
+ renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec
+ S_WAITCNT 3952
+ early-clobber renamable $vgpr1 = V_CVT_SCALEF32_SR_PK_FP4_BF16_e64 8, killed $vgpr2, 0, killed $vgpr3, 4, killed $vgpr4, killed $vgpr0, 0, implicit $mode, implicit $exec
+ renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr1, $vgpr1, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
+
+---
+# GCN-LABEL: test_scalef32_sr_pk_fp4_f32_hazard
+# GCN: V_CVT_SCALEF32_SR_PK_FP4_F32_e64
+# GCN: S_NOP 0
+# GCN: V_ADD_U32_e32
+name: test_scalef32_sr_pk_fp4_f32_hazard
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ S_WAITCNT 0
+ renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec
+ S_WAITCNT 3952
+ early-clobber renamable $vgpr1 = V_CVT_SCALEF32_SR_PK_FP4_F32_e64 8, killed $vgpr2_vgpr3, 0, killed $vgpr4, 4, killed $vgpr5, killed $vgpr0, 0, implicit $mode, implicit $exec
+ renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr1, $vgpr1, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
+
+---
+# GCN-LABEL: test_cvt_scalef32_fp4_f16_hazard
+# GCN: V_CVT_SCALEF32_PK_FP4_F16_e64
+# GCN: S_NOP 0
+# GCN: V_ADD_U32_e32
+name: test_cvt_scalef32_fp4_f16_hazard
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+ S_WAITCNT 0
+ renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, killed $vgpr0, 0, killed $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec
+ renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr2, $vgpr2, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
+
+---
+# GCN-LABEL: test_cvt_scalef32_fp4_bf16_hazard
+# GCN: V_CVT_SCALEF32_PK_FP4_BF16_e64
+# GCN: S_NOP 0
+# GCN: V_ADD_U32_e32
+name: test_cvt_scalef32_fp4_bf16_hazard
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+ S_WAITCNT 0
+ renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_BF16_e64 8, killed $vgpr0, 0, killed $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec
+ renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr2, $vgpr2, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
+
+---
+# GCN-LABEL: test_cvt_scalef32_hazard_skipping_over_meta_instr
+# GCN: V_CVT_SCALEF32_SR_BF8_F16_e64
+# GCN: S_NOP 0
+# GCN: V_ADD_U32_e32
+name: test_cvt_scalef32_hazard_skipping_over_meta_instr
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ S_WAITCNT 0
+ renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec
+ S_WAITCNT 3952
+ renamable $vgpr0 = V_CVT_SCALEF32_SR_BF8_F16_e64 8, killed $vgpr2, 0, killed $vgpr3, 4, killed $vgpr4, killed $vgpr0, 0, implicit $mode, implicit $exec
+ $vgpr4 = KILL
+ renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr0, $vgpr0, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
+
+---
+# GCN-LABEL: test_cvt_f16_to_fp4_to_f16_hazard
+# GCN: V_CVT_SCALEF32_PK_FP4_F16_e64
+# GCN: S_NOP 0
+# GCN: V_CVT_SCALEF32_PK_F16_FP4_e64
+# GCN: S_SETPC_B64_return
+name: test_cvt_f16_to_fp4_to_f16_hazard
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+ S_WAITCNT 0
+ renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, killed $vgpr0, 0, $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec
+ renamable $vgpr0 = V_CVT_SCALEF32_PK_F16_FP4_e64 4, killed $vgpr2, 0, killed $vgpr1, 0, implicit $mode, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
+
+---
+# GCN-LABEL: test_cvt_scalef32_hazard_pseudo
+# GCN: V_CVT_SCALEF32_SR_BF8_F16_e64
+# GCN: S_NOP 0
+# GCN: V_ADD_U32_e32
+name: test_cvt_scalef32_hazard_pseudo
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ S_WAITCNT 0
+ renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec
+ S_WAITCNT 3952
+ renamable $vgpr0 = V_CVT_SCALEF32_SR_BF8_F16_e64 8, killed $vgpr2, 0, killed $vgpr3, 4, killed $vgpr4, killed $vgpr0, 0, implicit $mode, implicit $exec
+ WAVE_BARRIER
+ renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr0, $vgpr0, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
+
+---
+# GCN-LABEL: test_call_consuming_cvt_scalef32_hazard
+# GCN: V_CVT_SCALEF32_PK_FP4_F16_e64
+# GCN: SI_CALL
+name: test_call_consuming_cvt_scalef32_hazard
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+ BUNDLE implicit-def $sgpr0_sgpr1, implicit-def $sgpr0, implicit-def $sgpr0_lo16, implicit-def $sgpr0_hi16, implicit-def $sgpr1, implicit-def $sgpr1_lo16, implicit-def $sgpr1_hi16, implicit-def $scc {
+ $sgpr0_sgpr1 = S_GETPC_B64
+ $sgpr0 = S_ADD_U32 internal $sgpr0, target-flags(amdgpu-gotprel32-lo) @test_cvt_scalef32_hazard_pseudo + 4, implicit-def $scc
+ $sgpr1 = S_ADDC_U32 internal $sgpr1, target-flags(amdgpu-gotprel32-hi) @test_cvt_scalef32_hazard_pseudo + 12, implicit-def $scc, implicit internal $scc
+ }
+ renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr0_sgpr1, 0, 0
+ renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, killed $vgpr0, 0, killed $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec
+ dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr0_sgpr1, @test_cvt_scalef32_hazard_pseudo, csr_amdgpu_gfx90ainsts, implicit undef $sgpr4_sgpr5, implicit undef $sgpr6_sgpr7, implicit killed $sgpr8_sgpr9, implicit undef $sgpr10_sgpr11, implicit undef $sgpr12, implicit undef $sgpr13, implicit undef $sgpr14, implicit-def $sgpr15, implicit undef $vgpr31, implicit killed $vgpr2, implicit-def $vgpr2
+ SI_RETURN_TO_EPILOG killed $vgpr0
+...
+
+---
+# GCN-LABEL: test_cvt_scalef32_inlineasm_hazard
+# GCN: V_CVT_SCALEF32_PK_FP4_F16_e64
+# GCN: S_NOP 0
+# GCN: INLINEASM
+name: test_cvt_scalef32_inlineasm_hazard
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+ S_WAITCNT 0
+ renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, killed $vgpr0, 0, killed $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, killed renamable $vgpr2
+ S_SETPC_B64_return undef $sgpr30_sgpr31
+...
+
+---
+# GCN-LABEL: test_cvt_scale_cvt_scale_hazard
+# GCN: V_CVT_SCALEF32_PK_FP4_F16_e64
+# GCN: S_NOP 0
+# GCN: V_CVT_SCALEF32_SR_PK_FP4_F16_e64
+# GCN: S_NOP 0
+# GCN: S_SETPC_B64_return
+name: test_cvt_scale_cvt_scale_hazard
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ S_WAITCNT 0
+ renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, $vgpr0, 0, $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec
+ early-clobber renamable $vgpr4 = V_CVT_SCALEF32_SR_PK_FP4_F16_e64 8, killed $vgpr0, 0, killed $vgpr3, 4, killed $vgpr1, killed $vgpr2, 0, implicit $mode, implicit $exec
+ $vgpr0 = V_MOV_B32_e32 killed $vgpr4, implicit $exec, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
+
+---
+# GCN-LABEL: test_cvt_scale_cvt_scale_waw_hazard
+# GCN: V_CVT_SCALEF32_PK_FP4_F16_e64
+# GCN: S_NOP 0
+# GCN: V_CVT_SCALEF32_SR_PK_FP4_F16_e64
+# GCN: S_SETPC_B64_return
+name: test_cvt_scale_cvt_scale_waw_hazard
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ S_WAITCNT 0
+ renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, $vgpr0, 0, $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec
+ early-clobber renamable $vgpr2 = V_CVT_SCALEF32_SR_PK_FP4_F16_e64 8, killed $vgpr0, 0, killed $vgpr3, 4, killed $vgpr1, killed $vgpr1, 0, implicit $mode, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll
index f80f2935856e36..046a72b9307d09 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll
@@ -1305,6 +1305,7 @@ define i32 @test_cvt_scalef32_fp4_f16_byte1(<2 x half> %src0, float %scale, i32
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v2, v0, v1 op_sel:[0,0,1,0]
+; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 %old, <2 x half> %src0, float %scale, i32 1)
@@ -1351,6 +1352,7 @@ define i32 @test_cvt_scalef32_fp4_bf16_byte1(<2 x bfloat> %src0, float %scale, i
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v2, v0, v1 op_sel:[0,0,1,0]
+; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 %old, <2 x bfloat> %src0, float %scale, i32 1)
diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
index d4110850f32066..7646197f13175b 100644
--- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefix=GFX7 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -mattr=+xnack < %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+xnack < %s | FileCheck -check-prefixes=GFX9,GFX900 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+xnack < %s | FileCheck -check-prefixes=GFX9,GFX940 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+xnack < %s | FileCheck -check-prefixes=GFX900 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+xnack < %s | FileCheck -check-prefixes=GFX940 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10_1 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX10_3 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
@@ -2249,5 +2249,3 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; are reserved at the end for xnack + vcc).
attributes #0 = { nounwind alignstack=64 "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="10,10" "no-realign-stack" }
attributes #1 = { nounwind alignstack=16 "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="10,10" "no-realign-stack" }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX9: {{.*}}
More information about the llvm-commits
mailing list