[llvm] 12fd00e - [AMDGPU] Add patterns for GFX11 v_minmax and v_maxmin instructions
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 5 08:07:58 PDT 2022
Author: Jay Foad
Date: 2022-07-05T16:07:47+01:00
New Revision: 12fd00ee17773798a1ddb0157b3d3df9f0e8dad2
URL: https://github.com/llvm/llvm-project/commit/12fd00ee17773798a1ddb0157b3d3df9f0e8dad2
DIFF: https://github.com/llvm/llvm-project/commit/12fd00ee17773798a1ddb0157b3d3df9f0e8dad2.diff
LOG: [AMDGPU] Add patterns for GFX11 v_minmax and v_maxmin instructions
Differential Revision: https://reviews.llvm.org/D128445
Added:
llvm/test/CodeGen/AMDGPU/minmax.ll
Modified:
llvm/lib/Target/AMDGPU/SIInstructions.td
llvm/test/CodeGen/AMDGPU/clamp.ll
Removed:
(none)
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 829669157893..8972bce30dc6 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3012,6 +3012,35 @@ multiclass Int16Med3Pat<Instruction med3Inst,
def : FPMed3Pat<f32, V_MED3_F32_e64>;
+class
+IntMinMaxPat<Instruction minmaxInst, SDPatternOperator min_or_max,
+ SDPatternOperator max_or_min_oneuse> : AMDGPUPat <
+ (DivergentBinFrag<min_or_max> (max_or_min_oneuse i32:$src0, i32:$src1),
+ i32:$src2),
+ (minmaxInst VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2)
+>;
+
+class
+FPMinMaxPat<Instruction minmaxInst, ValueType vt, SDPatternOperator min_or_max,
+ SDPatternOperator max_or_min_oneuse> : GCNPat <
+ (min_or_max (max_or_min_oneuse (VOP3Mods vt:$src0, i32:$src0_mods),
+ (VOP3Mods vt:$src1, i32:$src1_mods)),
+ (vt (VOP3Mods vt:$src2, i32:$src2_mods))),
+ (minmaxInst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
+ DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+let OtherPredicates = [isGFX11Plus] in {
+def : IntMinMaxPat<V_MAXMIN_I32_e64, smin, smax_oneuse>;
+def : IntMinMaxPat<V_MINMAX_I32_e64, smax, smin_oneuse>;
+def : IntMinMaxPat<V_MAXMIN_U32_e64, umin, umax_oneuse>;
+def : IntMinMaxPat<V_MINMAX_U32_e64, umax, umin_oneuse>;
+def : FPMinMaxPat<V_MINMAX_F32_e64, f32, fmaxnum_like, fminnum_like_oneuse>;
+def : FPMinMaxPat<V_MAXMIN_F32_e64, f32, fminnum_like, fmaxnum_like_oneuse>;
+def : FPMinMaxPat<V_MINMAX_F16_e64, f16, fmaxnum_like, fminnum_like_oneuse>;
+def : FPMinMaxPat<V_MAXMIN_F16_e64, f16, fminnum_like, fmaxnum_like_oneuse>;
+}
+
let OtherPredicates = [isGFX9Plus] in {
def : FP16Med3Pat<f16, V_MED3_F16_e64>;
defm : Int16Med3Pat<V_MED3_I16_e64, smin, smax, smax_oneuse, smin_oneuse>;
diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll
index d18b2b55d048..b2770092456d 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp.ll
@@ -268,9 +268,8 @@ define amdgpu_kernel void @v_clamp_negzero_f32(float addrspace(1)* %out, float a
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v1, 0.5, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v1, 0x80000000, v1
-; GFX11-NEXT: v_min_f32_e32 v1, 1.0, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_maxmin_f32 v1, v1, 0x80000000, 1.0
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -347,9 +346,8 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(float addrspace(1)* %o
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e32 v1, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v1, 0x80000000, v1
-; GFX11-NEXT: v_min_f32_e32 v1, 1.0, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_maxmin_f32 v1, v1, 0x80000000, 1.0
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/minmax.ll b/llvm/test/CodeGen/AMDGPU/minmax.ll
new file mode 100644
index 000000000000..7f82f5d0a807
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/minmax.ll
@@ -0,0 +1,422 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,SDAG %s
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GISEL %s
+
+define i32 @test_minmax_i32(i32 %a, i32 %b, i32 %c) {
+; GFX11-LABEL: test_minmax_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_maxmin_i32 v0, v0, v1, v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %smax = call i32 @llvm.smax.i32(i32 %a, i32 %b)
+ %sminmax = call i32 @llvm.smin.i32(i32 %smax, i32 %c)
+ ret i32 %sminmax
+}
+
+define amdgpu_ps void @s_test_minmax_i32(i32 inreg %a, i32 inreg %b, i32 inreg %c, i32 addrspace(1)* inreg %out) {
+; SDAG-LABEL: s_test_minmax_i32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_max_i32 s0, s0, s1
+; SDAG-NEXT: s_mov_b32 s5, s4
+; SDAG-NEXT: s_min_i32 s0, s0, s2
+; SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-NEXT: s_mov_b32 s4, s3
+; SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: s_test_minmax_i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_max_i32 s0, s0, s1
+; GISEL-NEXT: s_mov_b32 s6, s3
+; GISEL-NEXT: s_min_i32 s0, s0, s2
+; GISEL-NEXT: s_mov_b32 s7, s4
+; GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
+; GISEL-NEXT: global_store_b32 v1, v0, s[6:7]
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+ %smax = call i32 @llvm.smax.i32(i32 %a, i32 %b)
+ %sminmax = call i32 @llvm.smin.i32(i32 %smax, i32 %c)
+ store i32 %sminmax, i32 addrspace(1)* %out
+ ret void
+}
+
+define i32 @test_minmax_commuted_i32(i32 %a, i32 %b, i32 %c) {
+; GFX11-LABEL: test_minmax_commuted_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_maxmin_i32 v0, v0, v1, v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %smax = call i32 @llvm.smax.i32(i32 %a, i32 %b)
+ %sminmax = call i32 @llvm.smin.i32(i32 %c, i32 %smax)
+ ret i32 %sminmax
+}
+
+define i32 @test_maxmin_i32(i32 %a, i32 %b, i32 %c) {
+; GFX11-LABEL: test_maxmin_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_minmax_i32 v0, v0, v1, v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %smin = call i32 @llvm.smin.i32(i32 %a, i32 %b)
+ %smaxmin = call i32 @llvm.smax.i32(i32 %smin, i32 %c)
+ ret i32 %smaxmin
+}
+
+define i32 @test_maxmin_commuted_i32(i32 %a, i32 %b, i32 %c) {
+; GFX11-LABEL: test_maxmin_commuted_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_minmax_i32 v0, v0, v1, v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %smin = call i32 @llvm.smin.i32(i32 %a, i32 %b)
+ %smaxmin = call i32 @llvm.smax.i32(i32 %c, i32 %smin)
+ ret i32 %smaxmin
+}
+
+define void @test_smed3_i32(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) {
+; GFX11-LABEL: test_smed3_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_med3_i32 v2, v2, v3, v4
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %tmp0 = call i32 @llvm.smin.i32(i32 %x, i32 %y)
+ %tmp1 = call i32 @llvm.smax.i32(i32 %x, i32 %y)
+ %tmp2 = call i32 @llvm.smin.i32(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @llvm.smax.i32(i32 %tmp0, i32 %tmp2)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+define i32 @test_minmax_u32(i32 %a, i32 %b, i32 %c) {
+; GFX11-LABEL: test_minmax_u32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_maxmin_u32 v0, v0, v1, v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %umax = call i32 @llvm.umax.i32(i32 %a, i32 %b)
+ %uminmax = call i32 @llvm.umin.i32(i32 %umax, i32 %c)
+ ret i32 %uminmax
+}
+
+define amdgpu_ps void @s_test_minmax_u32(i32 inreg %a, i32 inreg %b, i32 inreg %c, i32 addrspace(1)* inreg %out) {
+; SDAG-LABEL: s_test_minmax_u32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_max_u32 s0, s0, s1
+; SDAG-NEXT: s_mov_b32 s5, s4
+; SDAG-NEXT: s_min_u32 s0, s0, s2
+; SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-NEXT: s_mov_b32 s4, s3
+; SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: s_test_minmax_u32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_max_u32 s0, s0, s1
+; GISEL-NEXT: s_mov_b32 s6, s3
+; GISEL-NEXT: s_min_u32 s0, s0, s2
+; GISEL-NEXT: s_mov_b32 s7, s4
+; GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
+; GISEL-NEXT: global_store_b32 v1, v0, s[6:7]
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+ %smax = call i32 @llvm.umax.i32(i32 %a, i32 %b)
+ %sminmax = call i32 @llvm.umin.i32(i32 %smax, i32 %c)
+ store i32 %sminmax, i32 addrspace(1)* %out
+ ret void
+}
+
+define i32 @test_minmax_commuted_u32(i32 %a, i32 %b, i32 %c) {
+; GFX11-LABEL: test_minmax_commuted_u32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_maxmin_u32 v0, v0, v1, v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %umax = call i32 @llvm.umax.i32(i32 %a, i32 %b)
+ %uminmax = call i32 @llvm.umin.i32(i32 %c, i32 %umax)
+ ret i32 %uminmax
+}
+
+define i32 @test_maxmin_u32(i32 %a, i32 %b, i32 %c) {
+; GFX11-LABEL: test_maxmin_u32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_minmax_u32 v0, v0, v1, v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %umin = call i32 @llvm.umin.i32(i32 %a, i32 %b)
+ %umaxmin = call i32 @llvm.umax.i32(i32 %umin, i32 %c)
+ ret i32 %umaxmin
+}
+
+define i32 @test_maxmin_commuted_u32(i32 %a, i32 %b, i32 %c) {
+; GFX11-LABEL: test_maxmin_commuted_u32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_minmax_u32 v0, v0, v1, v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %umin = call i32 @llvm.umin.i32(i32 %a, i32 %b)
+ %umaxmin = call i32 @llvm.umax.i32(i32 %c, i32 %umin)
+ ret i32 %umaxmin
+}
+
+define void @test_umed3_i32(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) {
+; GFX11-LABEL: test_umed3_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_med3_u32 v2, v2, v3, v4
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %tmp0 = call i32 @llvm.umin.i32(i32 %x, i32 %y)
+ %tmp1 = call i32 @llvm.umax.i32(i32 %x, i32 %y)
+ %tmp2 = call i32 @llvm.umin.i32(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @llvm.umax.i32(i32 %tmp0, i32 %tmp2)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+define float @test_minmax_f32_ieee_true(float %a, float %b, float %c) {
+; SDAG-LABEL: test_minmax_f32_ieee_true:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; SDAG-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0
+; SDAG-NEXT: v_max_f32_e32 v2, v2, v2
+; SDAG-NEXT: v_maxmin_f32 v0, v0, v1, v2
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_minmax_f32_ieee_true:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GISEL-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
+; GISEL-NEXT: v_max_f32_e32 v2, v2, v2
+; GISEL-NEXT: v_maxmin_f32 v0, v0, v1, v2
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %max = call float @llvm.maxnum.f32(float %a, float %b)
+ %minmax = call float @llvm.minnum.f32(float %max, float %c)
+ ret float %minmax
+}
+
+define amdgpu_ps void @s_test_minmax_f32_ieee_false(float inreg %a, float inreg %b, float inreg %c, float addrspace(1)* inreg %out) {
+; SDAG-LABEL: s_test_minmax_f32_ieee_false:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; SDAG-NEXT: s_mov_b32 s5, s4
+; SDAG-NEXT: s_mov_b32 s4, s3
+; SDAG-NEXT: v_maxmin_f32 v0, s0, s1, v0
+; SDAG-NEXT: global_store_b32 v1, v0, s[4:5]
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: s_test_minmax_f32_ieee_false:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; GISEL-NEXT: s_mov_b32 s6, s3
+; GISEL-NEXT: s_mov_b32 s7, s4
+; GISEL-NEXT: v_maxmin_f32 v0, s0, s1, v0
+; GISEL-NEXT: global_store_b32 v1, v0, s[6:7]
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+ %smax = call float @llvm.maxnum.f32(float %a, float %b)
+ %sminmax = call float @llvm.minnum.f32(float %smax, float %c)
+ store float %sminmax, float addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps float @test_minmax_commuted_f32_ieee_false(float %a, float %b, float %c) {
+; GFX11-LABEL: test_minmax_commuted_f32_ieee_false:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_maxmin_f32 v0, v0, v1, v2
+; GFX11-NEXT: ; return to shader part epilog
+ %max = call float @llvm.maxnum.f32(float %a, float %b)
+ %minmax = call float @llvm.minnum.f32(float %c, float %max)
+ ret float %minmax
+}
+
+define float @test_maxmin_f32_ieee_true(float %a, float %b, float %c) {
+; SDAG-LABEL: test_maxmin_f32_ieee_true:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; SDAG-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0
+; SDAG-NEXT: v_max_f32_e32 v2, v2, v2
+; SDAG-NEXT: v_minmax_f32 v0, v0, v1, v2
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_maxmin_f32_ieee_true:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GISEL-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
+; GISEL-NEXT: v_max_f32_e32 v2, v2, v2
+; GISEL-NEXT: v_minmax_f32 v0, v0, v1, v2
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %min = call float @llvm.minnum.f32(float %a, float %b)
+ %maxmin = call float @llvm.maxnum.f32(float %min, float %c)
+ ret float %maxmin
+}
+
+define amdgpu_ps float @test_maxmin_commuted_f32_ieee_false(float %a, float %b, float %c) {
+; GFX11-LABEL: test_maxmin_commuted_f32_ieee_false:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_minmax_f32 v0, v0, v1, v2
+; GFX11-NEXT: ; return to shader part epilog
+ %min = call float @llvm.minnum.f32(float %a, float %b)
+ %maxmin = call float @llvm.maxnum.f32(float %c, float %min)
+ ret float %maxmin
+}
+
+define void @test_med3_f32(float addrspace(1)* %arg, float %x, float %y, float %z) #0 {
+; GFX11-LABEL: test_med3_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_med3_f32 v2, v2, v3, v4
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %tmp0 = call float @llvm.minnum.f32(float %x, float %y)
+ %tmp1 = call float @llvm.maxnum.f32(float %x, float %y)
+ %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %z)
+ %tmp3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ store float %tmp3, float addrspace(1)* %arg
+ ret void
+}
+
+define amdgpu_ps half @test_minmax_f16_ieee_false(half %a, half %b, half %c) {
+; GFX11-LABEL: test_minmax_f16_ieee_false:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_maxmin_f16 v0, v0, v1, v2
+; GFX11-NEXT: ; return to shader part epilog
+ %max = call half @llvm.maxnum.f16(half %a, half %b)
+ %minmax = call half @llvm.minnum.f16(half %max, half %c)
+ ret half %minmax
+}
+
+define amdgpu_ps void @s_test_minmax_f16_ieee_false(half inreg %a, half inreg %b, half inreg %c, half addrspace(1)* inreg %out) {
+; SDAG-LABEL: s_test_minmax_f16_ieee_false:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; SDAG-NEXT: s_mov_b32 s5, s4
+; SDAG-NEXT: s_mov_b32 s4, s3
+; SDAG-NEXT: v_maxmin_f16 v0, s0, s1, v0
+; SDAG-NEXT: global_store_b16 v1, v0, s[4:5]
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: s_test_minmax_f16_ieee_false:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; GISEL-NEXT: s_mov_b32 s6, s3
+; GISEL-NEXT: s_mov_b32 s7, s4
+; GISEL-NEXT: v_maxmin_f16 v0, s0, s1, v0
+; GISEL-NEXT: global_store_b16 v1, v0, s[6:7]
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+ %smax = call half @llvm.maxnum.f16(half %a, half %b)
+ %sminmax = call half @llvm.minnum.f16(half %smax, half %c)
+ store half %sminmax, half addrspace(1)* %out
+ ret void
+}
+
+define half @test_minmax_commuted_f16_ieee_true(half %a, half %b, half %c) {
+; SDAG-LABEL: test_minmax_commuted_f16_ieee_true:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; SDAG-NEXT: v_max_f16_e32 v1, v1, v1
+; SDAG-NEXT: v_max_f16_e32 v0, v0, v0
+; SDAG-NEXT: v_max_f16_e32 v2, v2, v2
+; SDAG-NEXT: v_maxmin_f16 v0, v0, v1, v2
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_minmax_commuted_f16_ieee_true:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GISEL-NEXT: v_max_f16_e32 v0, v0, v0
+; GISEL-NEXT: v_max_f16_e32 v1, v1, v1
+; GISEL-NEXT: v_max_f16_e32 v2, v2, v2
+; GISEL-NEXT: v_maxmin_f16 v0, v0, v1, v2
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %max = call half @llvm.maxnum.f16(half %a, half %b)
+ %minmax = call half @llvm.minnum.f16(half %c, half %max)
+ ret half %minmax
+}
+
+define amdgpu_ps half @test_maxmin_f16_ieee_false(half %a, half %b, half %c) {
+; GFX11-LABEL: test_maxmin_f16_ieee_false:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_minmax_f16 v0, v0, v1, v2
+; GFX11-NEXT: ; return to shader part epilog
+ %min = call half @llvm.minnum.f16(half %a, half %b)
+ %maxmin = call half @llvm.maxnum.f16(half %min, half %c)
+ ret half %maxmin
+}
+
+define half @test_maxmin_commuted_f16_ieee_true(half %a, half %b, half %c) {
+; SDAG-LABEL: test_maxmin_commuted_f16_ieee_true:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; SDAG-NEXT: v_max_f16_e32 v1, v1, v1
+; SDAG-NEXT: v_max_f16_e32 v0, v0, v0
+; SDAG-NEXT: v_max_f16_e32 v2, v2, v2
+; SDAG-NEXT: v_minmax_f16 v0, v0, v1, v2
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_maxmin_commuted_f16_ieee_true:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GISEL-NEXT: v_max_f16_e32 v0, v0, v0
+; GISEL-NEXT: v_max_f16_e32 v1, v1, v1
+; GISEL-NEXT: v_max_f16_e32 v2, v2, v2
+; GISEL-NEXT: v_minmax_f16 v0, v0, v1, v2
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %min = call half @llvm.minnum.f16(half %a, half %b)
+ %maxmin = call half @llvm.maxnum.f16(half %c, half %min)
+ ret half %maxmin
+}
+
+define void @test_med3_f16(half addrspace(1)* %arg, half %x, half %y, half %z) #0 {
+; GFX11-LABEL: test_med3_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_med3_f16 v2, v2, v3, v4
+; GFX11-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %tmp0 = call half @llvm.minnum.f16(half %x, half %y)
+ %tmp1 = call half @llvm.maxnum.f16(half %x, half %y)
+ %tmp2 = call half @llvm.minnum.f16(half %tmp1, half %z)
+ %tmp3 = call half @llvm.maxnum.f16(half %tmp0, half %tmp2)
+ store half %tmp3, half addrspace(1)* %arg
+ ret void
+}
+
+declare i32 @llvm.smin.i32(i32, i32)
+declare i32 @llvm.smax.i32(i32, i32)
+declare i32 @llvm.umin.i32(i32, i32)
+declare i32 @llvm.umax.i32(i32, i32)
+declare half @llvm.minnum.f16(half, half)
+declare half @llvm.maxnum.f16(half, half)
+declare float @llvm.minnum.f32(float, float)
+declare float @llvm.maxnum.f32(float, float)
+attributes #0 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="true" }
+
More information about the llvm-commits mailing list