[llvm] [AMDGPU] Fix gfx950 Trans32 latency (PR #156411)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 1 23:21:06 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Austin Kerbow (kerbowa)
<details>
<summary>Changes</summary>
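Updated the Trans32 (`WriteTrans32`) latency from 4 to 2 on gfx950. The entry is moved out of the common `SICommonWriteRes` multiclass and into the individual scheduling models, so the other models keep a latency of 4.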
---
Full diff: https://github.com/llvm/llvm-project/pull/156411.diff
2 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SISchedule.td (+5-1)
- (modified) llvm/test/CodeGen/AMDGPU/bf16.ll (+6-6)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td
index 8eecb1c1019ae..9319d43e2ce1f 100644
--- a/llvm/lib/Target/AMDGPU/SISchedule.td
+++ b/llvm/lib/Target/AMDGPU/SISchedule.td
@@ -172,7 +172,6 @@ multiclass SICommonWriteRes {
def : HWVALUWriteRes<Write32Bit, 1>;
def : HWVALUWriteRes<WriteFloatCvt, 4>;
- def : HWVALUWriteRes<WriteTrans32, 4>;
def : HWVALUWriteRes<WriteQuarterRate32, 4>;
let ReleaseAtCycles = [4] in
@@ -231,6 +230,7 @@ def : HWVALUWriteRes<WriteFloatFMA, 1>;
def : HWVALUWriteRes<WriteDouble, 4>;
def : HWVALUWriteRes<WriteDoubleAdd, 2>;
def : HWVALUWriteRes<WriteDoubleCvt, 4>;
+def : HWVALUWriteRes<WriteTrans32, 4>;
def : HWVALUWriteRes<WriteTrans64, 4>;
} // End RetireOOO = 1
@@ -249,6 +249,7 @@ def : HWVALUWriteRes<WriteFloatFMA, 16>;
def : HWVALUWriteRes<WriteDouble, 16>;
def : HWVALUWriteRes<WriteDoubleAdd, 8>;
def : HWVALUWriteRes<WriteDoubleCvt, 4>;
+def : HWVALUWriteRes<WriteTrans32, 4>;
def : HWVALUWriteRes<WriteTrans64, 16>;
} // End RetireOOO = 1
@@ -269,6 +270,7 @@ def : HWVALUWriteRes<WriteFloatFMA, 1>;
def : HWVALUWriteRes<WriteDouble, 1>;
def : HWVALUWriteRes<WriteDoubleAdd, 1>;
def : HWVALUWriteRes<WriteDoubleCvt, 1>;
+def : HWVALUWriteRes<WriteTrans32, 4>;
def : HWVALUWriteRes<WriteTrans64, 4>;
def : HWVALUWriteRes<WriteIntMul, 1>;
def : HWVALUWriteRes<Write64Bit, 1>;
@@ -292,6 +294,7 @@ def : HWVALUWriteRes<WriteFloatFMA, 1>;
def : HWVALUWriteRes<WriteDouble, 1>;
def : HWVALUWriteRes<WriteDoubleAdd, 1>;
def : HWVALUWriteRes<WriteDoubleCvt, 1>;
+def : HWVALUWriteRes<WriteTrans32, 4>;
def : HWVALUWriteRes<WriteTrans64, 4>;
def : HWVALUWriteRes<WriteIntMul, 1>;
def : HWVALUWriteRes<Write64Bit, 1>;
@@ -326,6 +329,7 @@ def : HWVALUWriteRes<WriteFloatFMA, 1>;
def : HWVALUWriteRes<WriteDouble, 1>;
def : HWVALUWriteRes<WriteDoubleAdd, 1>;
def : HWVALUWriteRes<WriteDoubleCvt, 1>;
+def : HWVALUWriteRes<WriteTrans32, 2>;
def : HWVALUWriteRes<WriteTrans64, 4>;
def : HWVALUWriteRes<WriteIntMul, 1>;
def : HWVALUWriteRes<Write64Bit, 1>;
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 10e523d1a0cf1..e8a4329e7f5cf 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -30434,15 +30434,15 @@ define bfloat @v_exp_bf16(bfloat %a) {
; GFX950-NEXT: v_sub_f32_e32 v3, v1, v2
; GFX950-NEXT: v_fma_f32 v1, v0, s0, -v1
; GFX950-NEXT: v_fmamk_f32 v1, v0, 0x32a5705f, v1
+; GFX950-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX950-NEXT: v_add_f32_e32 v1, v3, v1
; GFX950-NEXT: v_exp_f32_e32 v1, v1
-; GFX950-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX950-NEXT: s_mov_b32 s0, 0xc2ce8ed0
+; GFX950-NEXT: v_ldexp_f32 v1, v1, v2
; GFX950-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0
; GFX950-NEXT: s_mov_b32 s0, 0x42b17218
-; GFX950-NEXT: v_ldexp_f32 v1, v1, v2
-; GFX950-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX950-NEXT: v_mov_b32_e32 v2, 0x7f800000
+; GFX950-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX950-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v0
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
@@ -30834,15 +30834,15 @@ define bfloat @v_exp10_bf16(bfloat %a) {
; GFX950-NEXT: v_sub_f32_e32 v3, v1, v2
; GFX950-NEXT: v_fma_f32 v1, v0, s0, -v1
; GFX950-NEXT: v_fmamk_f32 v1, v0, 0x33979a37, v1
+; GFX950-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX950-NEXT: v_add_f32_e32 v1, v3, v1
; GFX950-NEXT: v_exp_f32_e32 v1, v1
-; GFX950-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX950-NEXT: s_mov_b32 s0, 0xc23369f4
+; GFX950-NEXT: v_ldexp_f32 v1, v1, v2
; GFX950-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0
; GFX950-NEXT: s_mov_b32 s0, 0x421a209b
-; GFX950-NEXT: v_ldexp_f32 v1, v1, v2
-; GFX950-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX950-NEXT: v_mov_b32_e32 v2, 0x7f800000
+; GFX950-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX950-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v0
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
``````````
</details>
https://github.com/llvm/llvm-project/pull/156411
More information about the llvm-commits mailing list