[llvm] 12fd00e - [AMDGPU] Add patterns for GFX11 v_minmax and v_maxmin instructions

Tue Jul 5 08:07:58 PDT 2022

Author: Jay Foad
Date: 2022-07-05T16:07:47+01:00
New Revision: 12fd00ee17773798a1ddb0157b3d3df9f0e8dad2

URL: https://github.com/llvm/llvm-project/commit/12fd00ee17773798a1ddb0157b3d3df9f0e8dad2
DIFF: https://github.com/llvm/llvm-project/commit/12fd00ee17773798a1ddb0157b3d3df9f0e8dad2.diff

LOG: [AMDGPU] Add patterns for GFX11 v_minmax and v_maxmin instructions

Differential Revision: https://reviews.llvm.org/D128445

Added: 
    llvm/test/CodeGen/AMDGPU/minmax.ll

Modified: 
    llvm/lib/Target/AMDGPU/SIInstructions.td
    llvm/test/CodeGen/AMDGPU/clamp.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 829669157893..8972bce30dc6 100644

--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3012,6 +3012,35 @@ multiclass Int16Med3Pat<Instruction med3Inst,
 
 def : FPMed3Pat<f32, V_MED3_F32_e64>;
 
+class
+IntMinMaxPat<Instruction minmaxInst, SDPatternOperator min_or_max,
+             SDPatternOperator max_or_min_oneuse> : AMDGPUPat <
+  (DivergentBinFrag<min_or_max> (max_or_min_oneuse i32:$src0, i32:$src1),
+                                i32:$src2),
+  (minmaxInst VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2)
+>;
+
+class
+FPMinMaxPat<Instruction minmaxInst, ValueType vt, SDPatternOperator min_or_max,
+            SDPatternOperator max_or_min_oneuse> : GCNPat <
+  (min_or_max (max_or_min_oneuse (VOP3Mods vt:$src0, i32:$src0_mods),
+                                 (VOP3Mods vt:$src1, i32:$src1_mods)),
+               (vt (VOP3Mods vt:$src2, i32:$src2_mods))),
+  (minmaxInst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
+              DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+let OtherPredicates = [isGFX11Plus] in {
+def : IntMinMaxPat<V_MAXMIN_I32_e64, smin, smax_oneuse>;
+def : IntMinMaxPat<V_MINMAX_I32_e64, smax, smin_oneuse>;
+def : IntMinMaxPat<V_MAXMIN_U32_e64, umin, umax_oneuse>;
+def : IntMinMaxPat<V_MINMAX_U32_e64, umax, umin_oneuse>;
+def : FPMinMaxPat<V_MINMAX_F32_e64, f32, fmaxnum_like, fminnum_like_oneuse>;
+def : FPMinMaxPat<V_MAXMIN_F32_e64, f32, fminnum_like, fmaxnum_like_oneuse>;
+def : FPMinMaxPat<V_MINMAX_F16_e64, f16, fmaxnum_like, fminnum_like_oneuse>;
+def : FPMinMaxPat<V_MAXMIN_F16_e64, f16, fminnum_like, fmaxnum_like_oneuse>;
+}
+
 let OtherPredicates = [isGFX9Plus] in {
 def : FP16Med3Pat<f16, V_MED3_F16_e64>;
 defm : Int16Med3Pat<V_MED3_I16_e64, smin, smax, smax_oneuse, smin_oneuse>;

diff  --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll
index d18b2b55d048..b2770092456d 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp.ll
@@ -268,9 +268,8 @@ define amdgpu_kernel void @v_clamp_negzero_f32(float addrspace(1)* %out, float a
 ; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_add_f32_e32 v1, 0.5, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v1, 0x80000000, v1
-; GFX11-NEXT:    v_min_f32_e32 v1, 1.0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_maxmin_f32 v1, v1, 0x80000000, 1.0
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -347,9 +346,8 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(float addrspace(1)* %o
 ; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v1, 0x80000000, v1
-; GFX11-NEXT:    v_min_f32_e32 v1, 1.0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_maxmin_f32 v1, v1, 0x80000000, 1.0
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/minmax.ll b/llvm/test/CodeGen/AMDGPU/minmax.ll
new file mode 100644
index 000000000000..7f82f5d0a807
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/minmax.ll
@@ -0,0 +1,422 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,SDAG %s
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GISEL %s
+
+define i32 @test_minmax_i32(i32 %a, i32 %b, i32 %c) {
+; GFX11-LABEL: test_minmax_i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_maxmin_i32 v0, v0, v1, v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %smax = call i32 @llvm.smax.i32(i32 %a, i32 %b)
+  %sminmax = call i32 @llvm.smin.i32(i32 %smax, i32 %c)
+  ret i32 %sminmax
+}
+
+define amdgpu_ps void @s_test_minmax_i32(i32 inreg %a, i32 inreg %b, i32 inreg %c, i32 addrspace(1)* inreg %out) {
+; SDAG-LABEL: s_test_minmax_i32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_max_i32 s0, s0, s1
+; SDAG-NEXT:    s_mov_b32 s5, s4
+; SDAG-NEXT:    s_min_i32 s0, s0, s2
+; SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-NEXT:    s_mov_b32 s4, s3
+; SDAG-NEXT:    global_store_b32 v0, v1, s[4:5]
+; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT:    s_endpgm
+;
+; GISEL-LABEL: s_test_minmax_i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_max_i32 s0, s0, s1
+; GISEL-NEXT:    s_mov_b32 s6, s3
+; GISEL-NEXT:    s_min_i32 s0, s0, s2
+; GISEL-NEXT:    s_mov_b32 s7, s4
+; GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
+; GISEL-NEXT:    global_store_b32 v1, v0, s[6:7]
+; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT:    s_endpgm
+  %smax = call i32 @llvm.smax.i32(i32 %a, i32 %b)
+  %sminmax = call i32 @llvm.smin.i32(i32 %smax, i32 %c)
+  store i32 %sminmax, i32 addrspace(1)* %out
+  ret void
+}
+
+define i32 @test_minmax_commuted_i32(i32 %a, i32 %b, i32 %c) {
+; GFX11-LABEL: test_minmax_commuted_i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_maxmin_i32 v0, v0, v1, v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %smax = call i32 @llvm.smax.i32(i32 %a, i32 %b)
+  %sminmax = call i32 @llvm.smin.i32(i32 %c, i32 %smax)
+  ret i32 %sminmax
+}
+
+define i32 @test_maxmin_i32(i32 %a, i32 %b, i32 %c) {
+; GFX11-LABEL: test_maxmin_i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_minmax_i32 v0, v0, v1, v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %smin = call i32 @llvm.smin.i32(i32 %a, i32 %b)
+  %smaxmin = call i32 @llvm.smax.i32(i32 %smin, i32 %c)
+  ret i32 %smaxmin
+}
+
+define i32 @test_maxmin_commuted_i32(i32 %a, i32 %b, i32 %c) {
+; GFX11-LABEL: test_maxmin_commuted_i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_minmax_i32 v0, v0, v1, v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %smin = call i32 @llvm.smin.i32(i32 %a, i32 %b)
+  %smaxmin = call i32 @llvm.smax.i32(i32 %c, i32 %smin)
+  ret i32 %smaxmin
+}
+
+define void @test_smed3_i32(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) {
+; GFX11-LABEL: test_smed3_i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_med3_i32 v2, v2, v3, v4
+; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %tmp0 = call i32 @llvm.smin.i32(i32 %x, i32 %y)
+  %tmp1 = call i32 @llvm.smax.i32(i32 %x, i32 %y)
+  %tmp2 = call i32 @llvm.smin.i32(i32 %tmp1, i32 %z)
+  %tmp3 = call i32 @llvm.smax.i32(i32 %tmp0, i32 %tmp2)
+  store i32 %tmp3, i32 addrspace(1)* %arg
+  ret void
+}
+
+define i32 @test_minmax_u32(i32 %a, i32 %b, i32 %c) {
+; GFX11-LABEL: test_minmax_u32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_maxmin_u32 v0, v0, v1, v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %umax = call i32 @llvm.umax.i32(i32 %a, i32 %b)
+  %uminmax = call i32 @llvm.umin.i32(i32 %umax, i32 %c)
+  ret i32 %uminmax
+}
+
+define amdgpu_ps void @s_test_minmax_u32(i32 inreg %a, i32 inreg %b, i32 inreg %c, i32 addrspace(1)* inreg %out) {
+; SDAG-LABEL: s_test_minmax_u32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_max_u32 s0, s0, s1
+; SDAG-NEXT:    s_mov_b32 s5, s4
+; SDAG-NEXT:    s_min_u32 s0, s0, s2
+; SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-NEXT:    s_mov_b32 s4, s3
+; SDAG-NEXT:    global_store_b32 v0, v1, s[4:5]
+; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT:    s_endpgm
+;
+; GISEL-LABEL: s_test_minmax_u32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_max_u32 s0, s0, s1
+; GISEL-NEXT:    s_mov_b32 s6, s3
+; GISEL-NEXT:    s_min_u32 s0, s0, s2
+; GISEL-NEXT:    s_mov_b32 s7, s4
+; GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
+; GISEL-NEXT:    global_store_b32 v1, v0, s[6:7]
+; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT:    s_endpgm
+  %umax = call i32 @llvm.umax.i32(i32 %a, i32 %b)
+  %uminmax = call i32 @llvm.umin.i32(i32 %umax, i32 %c) ; inreg operands: checks expect s_max_u32 + s_min_u32, not v_maxmin_u32
+  store i32 %uminmax, i32 addrspace(1)* %out
+  ret void
+}
+
+define i32 @test_minmax_commuted_u32(i32 %a, i32 %b, i32 %c) {
+; GFX11-LABEL: test_minmax_commuted_u32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_maxmin_u32 v0, v0, v1, v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %umax = call i32 @llvm.umax.i32(i32 %a, i32 %b)
+  %uminmax = call i32 @llvm.umin.i32(i32 %c, i32 %umax)
+  ret i32 %uminmax
+}
+
+define i32 @test_maxmin_u32(i32 %a, i32 %b, i32 %c) {
+; GFX11-LABEL: test_maxmin_u32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_minmax_u32 v0, v0, v1, v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %umin = call i32 @llvm.umin.i32(i32 %a, i32 %b)
+  %umaxmin = call i32 @llvm.umax.i32(i32 %umin, i32 %c)
+  ret i32 %umaxmin
+}
+
+define i32 @test_maxmin_commuted_u32(i32 %a, i32 %b, i32 %c) {
+; GFX11-LABEL: test_maxmin_commuted_u32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_minmax_u32 v0, v0, v1, v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %umin = call i32 @llvm.umin.i32(i32 %a, i32 %b)
+  %umaxmin = call i32 @llvm.umax.i32(i32 %c, i32 %umin)
+  ret i32 %umaxmin
+}
+
+define void @test_umed3_i32(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) {
+; GFX11-LABEL: test_umed3_i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_med3_u32 v2, v2, v3, v4
+; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %tmp0 = call i32 @llvm.umin.i32(i32 %x, i32 %y)
+  %tmp1 = call i32 @llvm.umax.i32(i32 %x, i32 %y)
+  %tmp2 = call i32 @llvm.umin.i32(i32 %tmp1, i32 %z)
+  %tmp3 = call i32 @llvm.umax.i32(i32 %tmp0, i32 %tmp2)
+  store i32 %tmp3, i32 addrspace(1)* %arg
+  ret void
+}
+
+define float @test_minmax_f32_ieee_true(float %a, float %b, float %c) {
+; SDAG-LABEL: test_minmax_f32_ieee_true:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; SDAG-NEXT:    v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0
+; SDAG-NEXT:    v_max_f32_e32 v2, v2, v2
+; SDAG-NEXT:    v_maxmin_f32 v0, v0, v1, v2
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_minmax_f32_ieee_true:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GISEL-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
+; GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
+; GISEL-NEXT:    v_maxmin_f32 v0, v0, v1, v2
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %max = call float @llvm.maxnum.f32(float %a, float %b)
+  %minmax = call float @llvm.minnum.f32(float %max, float %c)
+  ret float %minmax
+}
+
+define amdgpu_ps void @s_test_minmax_f32_ieee_false(float inreg %a, float inreg %b, float inreg %c, float addrspace(1)* inreg %out) {
+; SDAG-LABEL: s_test_minmax_f32_ieee_false:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; SDAG-NEXT:    s_mov_b32 s5, s4
+; SDAG-NEXT:    s_mov_b32 s4, s3
+; SDAG-NEXT:    v_maxmin_f32 v0, s0, s1, v0
+; SDAG-NEXT:    global_store_b32 v1, v0, s[4:5]
+; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT:    s_endpgm
+;
+; GISEL-LABEL: s_test_minmax_f32_ieee_false:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; GISEL-NEXT:    s_mov_b32 s6, s3
+; GISEL-NEXT:    s_mov_b32 s7, s4
+; GISEL-NEXT:    v_maxmin_f32 v0, s0, s1, v0
+; GISEL-NEXT:    global_store_b32 v1, v0, s[6:7]
+; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT:    s_endpgm
+  %max = call float @llvm.maxnum.f32(float %a, float %b)
+  %minmax = call float @llvm.minnum.f32(float %max, float %c) ; inreg operands: checks still expect a single v_maxmin_f32
+  store float %minmax, float addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_ps float @test_minmax_commuted_f32_ieee_false(float %a, float %b, float %c) {
+; GFX11-LABEL: test_minmax_commuted_f32_ieee_false:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_maxmin_f32 v0, v0, v1, v2
+; GFX11-NEXT:    ; return to shader part epilog
+  %max = call float @llvm.maxnum.f32(float %a, float %b)
+  %minmax = call float @llvm.minnum.f32(float %c, float %max)
+  ret float %minmax
+}
+
+define float @test_maxmin_f32_ieee_true(float %a, float %b, float %c) {
+; SDAG-LABEL: test_maxmin_f32_ieee_true:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; SDAG-NEXT:    v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0
+; SDAG-NEXT:    v_max_f32_e32 v2, v2, v2
+; SDAG-NEXT:    v_minmax_f32 v0, v0, v1, v2
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_maxmin_f32_ieee_true:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GISEL-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
+; GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
+; GISEL-NEXT:    v_minmax_f32 v0, v0, v1, v2
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %min = call float @llvm.minnum.f32(float %a, float %b)
+  %maxmin = call float @llvm.maxnum.f32(float %min, float %c)
+  ret float %maxmin
+}
+
+define amdgpu_ps float @test_maxmin_commuted_f32_ieee_false(float %a, float %b, float %c) {
+; GFX11-LABEL: test_maxmin_commuted_f32_ieee_false:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_minmax_f32 v0, v0, v1, v2
+; GFX11-NEXT:    ; return to shader part epilog
+  %min = call float @llvm.minnum.f32(float %a, float %b)
+  %maxmin = call float @llvm.maxnum.f32(float %c, float %min)
+  ret float %maxmin
+}
+
+define void @test_med3_f32(float addrspace(1)* %arg, float %x, float %y, float %z) #0 {
+; GFX11-LABEL: test_med3_f32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_med3_f32 v2, v2, v3, v4
+; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %tmp0 = call float @llvm.minnum.f32(float %x, float %y)
+  %tmp1 = call float @llvm.maxnum.f32(float %x, float %y)
+  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %z)
+  %tmp3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+  store float %tmp3, float addrspace(1)* %arg
+  ret void
+}
+
+define amdgpu_ps half @test_minmax_f16_ieee_false(half %a, half %b, half %c) {
+; GFX11-LABEL: test_minmax_f16_ieee_false:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_maxmin_f16 v0, v0, v1, v2
+; GFX11-NEXT:    ; return to shader part epilog
+  %max = call half @llvm.maxnum.f16(half %a, half %b)
+  %minmax = call half @llvm.minnum.f16(half %max, half %c)
+  ret half %minmax
+}
+
+define amdgpu_ps void @s_test_minmax_f16_ieee_false(half inreg %a, half inreg %b, half inreg %c, half addrspace(1)* inreg %out) {
+; SDAG-LABEL: s_test_minmax_f16_ieee_false:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; SDAG-NEXT:    s_mov_b32 s5, s4
+; SDAG-NEXT:    s_mov_b32 s4, s3
+; SDAG-NEXT:    v_maxmin_f16 v0, s0, s1, v0
+; SDAG-NEXT:    global_store_b16 v1, v0, s[4:5]
+; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT:    s_endpgm
+;
+; GISEL-LABEL: s_test_minmax_f16_ieee_false:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; GISEL-NEXT:    s_mov_b32 s6, s3
+; GISEL-NEXT:    s_mov_b32 s7, s4
+; GISEL-NEXT:    v_maxmin_f16 v0, s0, s1, v0
+; GISEL-NEXT:    global_store_b16 v1, v0, s[6:7]
+; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT:    s_endpgm
+  %max = call half @llvm.maxnum.f16(half %a, half %b)
+  %minmax = call half @llvm.minnum.f16(half %max, half %c) ; inreg operands: checks still expect a single v_maxmin_f16
+  store half %minmax, half addrspace(1)* %out
+  ret void
+}
+
+define half @test_minmax_commuted_f16_ieee_true(half %a, half %b, half %c) {
+; SDAG-LABEL: test_minmax_commuted_f16_ieee_true:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
+; SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
+; SDAG-NEXT:    v_max_f16_e32 v2, v2, v2
+; SDAG-NEXT:    v_maxmin_f16 v0, v0, v1, v2
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_minmax_commuted_f16_ieee_true:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
+; GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
+; GISEL-NEXT:    v_max_f16_e32 v2, v2, v2
+; GISEL-NEXT:    v_maxmin_f16 v0, v0, v1, v2
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %max = call half @llvm.maxnum.f16(half %a, half %b)
+  %minmax = call half @llvm.minnum.f16(half %c, half %max)
+  ret half %minmax
+}
+
+define amdgpu_ps half @test_maxmin_f16_ieee_false(half %a, half %b, half %c) {
+; GFX11-LABEL: test_maxmin_f16_ieee_false:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_minmax_f16 v0, v0, v1, v2
+; GFX11-NEXT:    ; return to shader part epilog
+  %min = call half @llvm.minnum.f16(half %a, half %b)
+  %maxmin = call half @llvm.maxnum.f16(half %min, half %c)
+  ret half %maxmin
+}
+
+define half @test_maxmin_commuted_f16_ieee_true(half %a, half %b, half %c) {
+; SDAG-LABEL: test_maxmin_commuted_f16_ieee_true:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
+; SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
+; SDAG-NEXT:    v_max_f16_e32 v2, v2, v2
+; SDAG-NEXT:    v_minmax_f16 v0, v0, v1, v2
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_maxmin_commuted_f16_ieee_true:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
+; GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
+; GISEL-NEXT:    v_max_f16_e32 v2, v2, v2
+; GISEL-NEXT:    v_minmax_f16 v0, v0, v1, v2
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %min = call half @llvm.minnum.f16(half %a, half %b)
+  %maxmin = call half @llvm.maxnum.f16(half %c, half %min)
+  ret half %maxmin
+}
+
+define void @test_med3_f16(half addrspace(1)* %arg, half %x, half %y, half %z) #0 {
+; GFX11-LABEL: test_med3_f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_med3_f16 v2, v2, v3, v4
+; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %tmp0 = call half @llvm.minnum.f16(half %x, half %y)
+  %tmp1 = call half @llvm.maxnum.f16(half %x, half %y)
+  %tmp2 = call half @llvm.minnum.f16(half %tmp1, half %z)
+  %tmp3 = call half @llvm.maxnum.f16(half %tmp0, half %tmp2)
+  store half %tmp3, half addrspace(1)* %arg
+  ret void
+}
+
+declare i32 @llvm.smin.i32(i32, i32)
+declare i32 @llvm.smax.i32(i32, i32)
+declare i32 @llvm.umin.i32(i32, i32)
+declare i32 @llvm.umax.i32(i32, i32)
+declare half @llvm.minnum.f16(half, half)
+declare half @llvm.maxnum.f16(half, half)
+declare float @llvm.minnum.f32(float, float)
+declare float @llvm.maxnum.f32(float, float)
+attributes #0 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="true" }
+