[llvm] 5e007af - [AMDGPU] Handle hazard in v_scalef32_sr_fp4_* conversions (#118589)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Dec 11 05:08:13 PST 2024
Author: Pravin Jagtap
Date: 2024-12-11T18:38:10+05:30
New Revision: 5e007afa9d4f175decc328ee89533a5fe89be99b
URL: https://github.com/llvm/llvm-project/commit/5e007afa9d4f175decc328ee89533a5fe89be99b
DIFF: https://github.com/llvm/llvm-project/commit/5e007afa9d4f175decc328ee89533a5fe89be99b.diff
LOG: [AMDGPU] Handle hazard in v_scalef32_sr_fp4_* conversions (#118589)
Presently, the compiler selectively adds a nop only when opsel != 0, i.e. only
when partially writing to the high bytes.
Experiments in SWDEV-499733 and SWDEV-501347 suggest that we need a nop
for the above cases irrespective of the opsel values.
Note: We might need to add a few others to the same table.
Added:
Modified:
llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
llvm/lib/Target/AMDGPU/SIInstrInfo.td
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
llvm/lib/Target/AMDGPU/VOP3Instructions.td
llvm/lib/Target/AMDGPU/VOPInstructions.td
llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index ecf03b14143ee3..5207201e14c091 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -916,21 +916,30 @@ getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
if (SIInstrInfo::isSDWA(MI)) {
// Type 1: SDWA with dst_sel != DWORD
if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
- if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
- return nullptr;
- } else {
- // Type 2 && Type 3: (VOP3 which write the hi bits) || (FP8DstSelInst
- // with op_sel[3:2] != 0)
- if (!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel) ||
- !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
- SISrcMods::DST_OP_SEL ||
- (AMDGPU::isFP8DstSelInst(Opcode) &&
- (TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
- SISrcMods::OP_SEL_0))))
- return nullptr;
- }
-
- return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+ if (DstSel->getImm() != AMDGPU::SDWA::DWORD)
+ return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+ }
+
+ AMDGPU::FPType IsFP4OrFP8ConvOpc = AMDGPU::getFPDstSelType(Opcode);
+ if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel)) {
+ // Type 2: VOP3 which write the hi bits
+ if (TII->getNamedImmOperand(MI, AMDGPU::OpName::src0_modifiers) &
+ SISrcMods::DST_OP_SEL)
+ return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+
+ // Type 3: FP8DstSelInst with op_sel[3:2] != 0)
+ if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP8 &&
+ (TII->getNamedImmOperand(MI, AMDGPU::OpName::src2_modifiers) &
+ SISrcMods::OP_SEL_0))
+ return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+ }
+
+ // Special case: nop is required for all the opsel values for fp4 sr variant
+ // cvt scale instructions
+ if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP4)
+ return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+
+ return nullptr;
}
/// Checks whether the provided \p MI "consumes" the operand with a Dest sel
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 7bc6db4cec1065..bb78e77a9dc1a6 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2567,6 +2567,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
field bit IsFP8SrcByteSel = 0;
field bit IsFP8DstByteSel = 0;
field bit HasFP8DstByteSel = 0;
+ field bit HasFP4DstByteSel = 0;
field bit IsFP8ByteSel = !or(IsFP8SrcByteSel, IsFP8DstByteSel);
field bit HasDst = !ne(DstVT.Value, untyped.Value);
@@ -3249,13 +3250,13 @@ def isMFMA_F8F6F4Table : GenericTable {
let PrimaryKeyName = "isMFMA_F8F6F4" ;
}
-def FP8DstByteSelTable : GenericTable {
+def FP4FP8DstByteSelTable : GenericTable {
let FilterClass = "VOP3_Pseudo";
- let CppTypeName = "FP8DstByteSelInfo";
- let Fields = ["Opcode", "HasFP8DstByteSel"];
+ let CppTypeName = "FP4FP8DstByteSelInfo";
+ let Fields = ["Opcode", "HasFP8DstByteSel", "HasFP4DstByteSel"];
let PrimaryKey = ["Opcode"];
- let PrimaryKeyName = "getFP8DstByteSelHelper";
+ let PrimaryKeyName = "getFP4FP8DstByteSelHelper";
}
def VOPDComponentTable : GenericTable {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 70deaf82dfaf34..f5b15cb0246a87 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -378,17 +378,18 @@ struct VOPTrue16Info {
bool IsTrue16;
};
-#define GET_FP8DstByteSelTable_DECL
-#define GET_FP8DstByteSelTable_IMPL
+#define GET_FP4FP8DstByteSelTable_DECL
+#define GET_FP4FP8DstByteSelTable_IMPL
struct DPMACCInstructionInfo {
uint16_t Opcode;
bool IsDPMACCInstruction;
};
-struct FP8DstByteSelInfo {
+struct FP4FP8DstByteSelInfo {
uint16_t Opcode;
bool HasFP8DstByteSel;
+ bool HasFP4DstByteSel;
};
#define GET_MTBUFInfoTable_DECL
@@ -655,9 +656,16 @@ bool isTrue16Inst(unsigned Opc) {
return Info ? Info->IsTrue16 : false;
}
-bool isFP8DstSelInst(unsigned Opc) {
- const FP8DstByteSelInfo *Info = getFP8DstByteSelHelper(Opc);
- return Info ? Info->HasFP8DstByteSel : false;
+FPType getFPDstSelType(unsigned Opc) {
+ const FP4FP8DstByteSelInfo *Info = getFP4FP8DstByteSelHelper(Opc);
+ if (!Info)
+ return FPType::None;
+ if (Info->HasFP8DstByteSel)
+ return FPType::FP8;
+ if (Info->HasFP4DstByteSel)
+ return FPType::FP4;
+
+ return FPType::None;
}
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc) {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index ea497d7b239d7e..29f64d0db8dd2e 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -55,6 +55,8 @@ static constexpr unsigned GFX12 = 1;
enum { AMDHSA_COV4 = 4, AMDHSA_COV5 = 5, AMDHSA_COV6 = 6 };
+enum class FPType { None, FP4, FP8 };
+
/// \returns True if \p STI is AMDHSA.
bool isHsaAbi(const MCSubtargetInfo &STI);
@@ -885,7 +887,7 @@ LLVM_READONLY
bool isTrue16Inst(unsigned Opc);
LLVM_READONLY
-bool isFP8DstSelInst(unsigned Opc);
+FPType getFPDstSelType(unsigned Opc);
LLVM_READONLY
bool isInvalidSingleUseConsumerInst(unsigned Opc);
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 8a9f8aa3d16d3a..94dce739b08b5f 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1014,7 +1014,7 @@ class VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<ValueType Src0Ty> :
let HasExtVOP3DPP = 0;
let HasOpSel = 1;
let HasOMod = 0;
- let HasFP8DstByteSel = 1;
+ let HasFP4DstByteSel = 1;
}
def VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile : VOP3_Profile<VOPProfile<[i32, v2f32, i32, f32]>, VOP3_OPSEL> {
@@ -1026,7 +1026,7 @@ def VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile : VOP3_Profile<VOPProfile<[i32
let HasExtVOP3DPP = 0;
let HasOpSel = 1;
let HasOMod = 0;
- let HasFP8DstByteSel = 1;
+ let HasFP4DstByteSel = 1;
}
class VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<ValueType DstTy> : VOP3_Profile<VOPProfile<[DstTy, i32, f32, untyped]>,
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 5c4d96d3688b87..9366e11233571c 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -110,6 +110,7 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
let IsSWMMAC = P.IsSWMMAC;
bit HasFP8DstByteSel = P.HasFP8DstByteSel;
+ bit HasFP4DstByteSel = P.HasFP4DstByteSel;
let AsmOperands = !if(!and(!not(P.IsTrue16), isVop3OpSel),
P.AsmVOP3OpSel,
diff --git a/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir b/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir
index 1bbad901d16b2c..49576433ab54da 100644
--- a/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir
+++ b/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir
@@ -642,17 +642,18 @@ body: |
...
---
-name: test_scalef32_sr_pk_fp4_bf16_neg_opsel0_hazard
+name: test_scalef32_sr_pk_fp4_bf16_opsel0_hazard
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GCN-LABEL: name: test_scalef32_sr_pk_fp4_bf16_neg_opsel0_hazard
+ ; GCN-LABEL: name: test_scalef32_sr_pk_fp4_bf16_opsel0_hazard
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
; GCN-NEXT: {{ $}}
; GCN-NEXT: S_WAITCNT 0
; GCN-NEXT: renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec
; GCN-NEXT: S_WAITCNT 3952
; GCN-NEXT: early-clobber renamable $vgpr1 = V_CVT_SCALEF32_SR_PK_FP4_BF16_e64 0, killed $vgpr2, 0, killed $vgpr3, 0, killed $vgpr4, killed $vgpr0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: S_NOP 0
; GCN-NEXT: renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr1, $vgpr1, implicit $exec
; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
S_WAITCNT 0
@@ -731,17 +732,18 @@ body: |
...
---
-name: test_scalef32_sr_pk_fp4_f32_neg_opsel0_hazard
+name: test_scalef32_sr_pk_fp4_f32_opsel0_hazard
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GCN-LABEL: name: test_scalef32_sr_pk_fp4_f32_neg_opsel0_hazard
+ ; GCN-LABEL: name: test_scalef32_sr_pk_fp4_f32_opsel0_hazard
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GCN-NEXT: {{ $}}
; GCN-NEXT: S_WAITCNT 0
; GCN-NEXT: renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec
; GCN-NEXT: S_WAITCNT 3952
; GCN-NEXT: early-clobber renamable $vgpr1 = V_CVT_SCALEF32_SR_PK_FP4_F32_e64 0, killed $vgpr2_vgpr3, 0, killed $vgpr4, 0, killed $vgpr5, killed $vgpr0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: S_NOP 0
; GCN-NEXT: renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr1, $vgpr1, implicit $exec
; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
S_WAITCNT 0
@@ -1119,17 +1121,18 @@ body: |
...
---
-name: test_cvt_scale_cvt_scalef32_sr_pk_fp4_f16_neg_opsel0_hazard
+name: test_cvt_scale_cvt_scalef32_sr_pk_fp4_f16_opsel0_hazard
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GCN-LABEL: name: test_cvt_scale_cvt_scalef32_sr_pk_fp4_f16_neg_opsel0_hazard
+ ; GCN-LABEL: name: test_cvt_scale_cvt_scalef32_sr_pk_fp4_f16_opsel0_hazard
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GCN-NEXT: {{ $}}
; GCN-NEXT: S_WAITCNT 0
; GCN-NEXT: renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, $vgpr0, 0, $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec
; GCN-NEXT: S_NOP 0
; GCN-NEXT: early-clobber renamable $vgpr4 = V_CVT_SCALEF32_SR_PK_FP4_F16_e64 0, killed $vgpr0, 0, killed $vgpr3, 0, killed $vgpr1, killed $vgpr2, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: S_NOP 0
; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr4, implicit $exec, implicit $exec
; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
S_WAITCNT 0
More information about the llvm-commits
mailing list