[llvm] d09dbda - [AMDGPU] bf16 clamp folding (#152573)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 7 12:59:53 PDT 2025
Author: Stanislav Mekhanoshin
Date: 2025-08-07T12:59:50-07:00
New Revision: d09dbdabb93ffdd6df25ae487c95a552f13e5e16
URL: https://github.com/llvm/llvm-project/commit/d09dbdabb93ffdd6df25ae487c95a552f13e5e16
DIFF: https://github.com/llvm/llvm-project/commit/d09dbdabb93ffdd6df25ae487c95a552f13e5e16.diff
LOG: [AMDGPU] bf16 clamp folding (#152573)
Added:
Modified:
llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
llvm/test/CodeGen/AMDGPU/bf16-math.ll
llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 0c653b1b46d65..962c276bc2123 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -2081,7 +2081,9 @@ SIFoldOperandsImpl::isClamp(const MachineInstr &MI) const {
case AMDGPU::V_MAX_F16_fake16_e64:
case AMDGPU::V_MAX_F64_e64:
case AMDGPU::V_MAX_NUM_F64_e64:
- case AMDGPU::V_PK_MAX_F16: {
+ case AMDGPU::V_PK_MAX_F16:
+ case AMDGPU::V_MAX_BF16_PSEUDO_e64:
+ case AMDGPU::V_PK_MAX_NUM_BF16: {
if (MI.mayRaiseFPException())
return nullptr;
@@ -2108,8 +2110,10 @@ SIFoldOperandsImpl::isClamp(const MachineInstr &MI) const {
// Having a 0 op_sel_hi would require swizzling the output in the source
// instruction, which we can't do.
- unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1
- : 0u;
+ unsigned UnsetMods =
+ (Op == AMDGPU::V_PK_MAX_F16 || Op == AMDGPU::V_PK_MAX_NUM_BF16)
+ ? SISrcMods::OP_SEL_1
+ : 0u;
if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
return nullptr;
return Src0;
diff --git a/llvm/test/CodeGen/AMDGPU/bf16-math.ll b/llvm/test/CodeGen/AMDGPU/bf16-math.ll
index 9979e832b799a..682b3b4d57209 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16-math.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16-math.ll
@@ -368,10 +368,7 @@ define amdgpu_ps float @test_clamp_v2bf16_s(<2 x bfloat> inreg %src) {
define amdgpu_ps bfloat @test_clamp_bf16_folding(bfloat %src) {
; GCN-LABEL: test_clamp_bf16_folding:
; GCN: ; %bb.0:
-; GCN-NEXT: v_exp_bf16_e32 v0, v0
-; GCN-NEXT: v_nop
-; GCN-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
-; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
+; GCN-NEXT: v_exp_bf16_e64 v0, v0 clamp
; GCN-NEXT: ; return to shader part epilog
%exp = call bfloat @llvm.exp2.bf16(bfloat %src)
%max = call bfloat @llvm.maxnum.bf16(bfloat %exp, bfloat 0.0)
@@ -382,9 +379,7 @@ define amdgpu_ps bfloat @test_clamp_bf16_folding(bfloat %src) {
define amdgpu_ps float @test_clamp_v2bf16_folding(<2 x bfloat> %src0, <2 x bfloat> %src1) {
; GCN-LABEL: test_clamp_v2bf16_folding:
; GCN: ; %bb.0:
-; GCN-NEXT: v_pk_mul_bf16 v0, v0, v1
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
+; GCN-NEXT: v_pk_mul_bf16 v0, v0, v1 clamp
; GCN-NEXT: ; return to shader part epilog
%mul = fmul <2 x bfloat> %src0, %src1
%max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %mul, <2 x bfloat> <bfloat 0.0, bfloat 0.0>)
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll
index 1b2eb83ba1726..439317269b724 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll
@@ -74,10 +74,11 @@ define bfloat @v_mad_mixlo_bf16_bf16lo_bf16lo_f32_clamp_post_cvt(bfloat %src0, b
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,0]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
+; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+
+
+
%src0.ext = fpext bfloat %src0 to float
%src1.ext = fpext bfloat %src1 to float
%result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2)
@@ -191,10 +192,11 @@ define <2 x bfloat> @v_mad_mix_v2f32_clamp_postcvt(<2 x bfloat> %src0, <2 x bflo
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1]
-; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 clamp
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+
+
+
%src0.ext = fpext <2 x bfloat> %src0 to <2 x float>
%src1.ext = fpext <2 x bfloat> %src1 to <2 x float>
%src2.ext = fpext <2 x bfloat> %src2 to <2 x float>
@@ -247,12 +249,12 @@ define <4 x bfloat> @v_mad_mix_v4f32_clamp_postcvt(<4 x bfloat> %src0, <4 x bflo
; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[6:7], v[0:1], v[2:3]
; GFX1250-NEXT: v_pk_fma_f32 v[2:3], v[8:9], v[10:11], v[12:13]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
-; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v2, v3
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
-; GFX1250-NEXT: v_pk_max_num_bf16 v1, v1, v1 clamp
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 clamp
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v2, v3 clamp
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+
+
+
%src0.ext = fpext <4 x bfloat> %src0 to <4 x float>
%src1.ext = fpext <4 x bfloat> %src1 to <4 x float>
%src2.ext = fpext <4 x bfloat> %src2 to <4 x float>
More information about the llvm-commits
mailing list