[llvm] [AMDGPU] Handle hazard in v_cvt_scalef32_pk_{fp|bf}8_{f|bf}16. (PR #140218)
via llvm-commits
llvm-commits at lists.llvm.org
Fri May 16 01:08:10 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Pravin Jagtap (pravinjagtap)
<details>
<summary>Changes</summary>
Presently, the compiler selectively adds a nop when opsel != 0, i.e. only when partially writing to the high bytes.
Experiments in SWDEV-531672 suggest that we need a nop for the above cases irrespective of the opsel values.
---
Full diff: https://github.com/llvm/llvm-project/pull/140218.diff
3 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp (+18)
- (modified) llvm/lib/Target/AMDGPU/VOP3Instructions.td (+1)
- (modified) llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir (+21-2)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 1561efe2cd295..ec2e33b202842 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -150,6 +150,18 @@ static bool isPermlane(const MachineInstr &MI) {
Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64;
}
+static bool isCVTScaleConvertsF16BF16ToFP8BF8(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case AMDGPU::V_CVT_SCALEF32_PK_FP8_F16_e64:
+ case AMDGPU::V_CVT_SCALEF32_PK_FP8_BF16_e64:
+ case AMDGPU::V_CVT_SCALEF32_PK_BF8_F16_e64:
+ case AMDGPU::V_CVT_SCALEF32_PK_BF8_BF16_e64:
+ return true;
+ default:
+ return false;
+ }
+}
+
static bool isLdsDma(const MachineInstr &MI) {
return SIInstrInfo::isVALU(MI) &&
(SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI));
@@ -899,6 +911,12 @@ getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
SISrcMods::DST_OP_SEL)
return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+ // Special case: F16BF16 to FP8BF8 class of cvt scale instructions need nop
+ // irrespective of the op_sel value.
+ if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP8 &&
+ isCVTScaleConvertsF16BF16ToFP8BF8(MI))
+ return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+
// Type 3: FP8DstSelInst with op_sel[3:2] != 0)
if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP8 &&
(TII->getNamedImmOperand(MI, AMDGPU::OpName::src2_modifiers) &
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index a7b90b9e319da..d9a4a255b53f7 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1082,6 +1082,7 @@ class VOP3_CVT_SCALE_PK_FP8BF8_F16BF16_TiedInput_Profile<VOPProfile P> : VOP3_Pr
let HasExtVOP3DPP = 0;
let HasOpSel = 1;
let HasOMod = 0;
+ let HasFP8DstByteSel = 1;
}
class VOP3_CVT_SCALEF32_PK_F864_Profile<VOPProfile P> : VOP3_Profile<P> {
diff --git a/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir b/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir
index 49576433ab54d..d9fc552980780 100644
--- a/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir
+++ b/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir
@@ -381,11 +381,30 @@ body: |
...
---
-name: test_cvt_scalef32_pk_fp8_f16_hazard
+name: test_cvt_scalef32_pk_fp8_f16_hazard_write_low_half
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2
- ; GCN-LABEL: name: test_cvt_scalef32_pk_fp8_f16_hazard
+ ; GCN-LABEL: name: test_cvt_scalef32_pk_fp8_f16_hazard_write_low_half
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: S_WAITCNT 0
+ ; GCN-NEXT: renamable $vgpr0 = V_CVT_SCALEF32_PK_FP8_F16_e64 0, killed $vgpr1, 0, killed $vgpr2, killed $vgpr0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: S_NOP 0
+ ; GCN-NEXT: renamable $vgpr0 = V_PK_ADD_U16 8, killed $vgpr0, 8, $vgpr0, 0, 0, 0, 0, 0, implicit $exec
+ ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+ S_WAITCNT 0
+ renamable $vgpr0 = V_CVT_SCALEF32_PK_FP8_F16_e64 0, killed $vgpr1, 0, killed $vgpr2, killed $vgpr0, 0, implicit $mode, implicit $exec
+ renamable $vgpr0 = V_PK_ADD_U16 8, killed $vgpr0, 8, $vgpr0, 0, 0, 0, 0, 0, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
+
+---
+name: test_cvt_scalef32_pk_fp8_f16_hazard_write_hi_half
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GCN-LABEL: name: test_cvt_scalef32_pk_fp8_f16_hazard_write_hi_half
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2
; GCN-NEXT: {{ $}}
; GCN-NEXT: S_WAITCNT 0
``````````
</details>
https://github.com/llvm/llvm-project/pull/140218
More information about the llvm-commits
mailing list